113 files changed, 12498 insertions, 429 deletions
diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index 74a135cd55b..df4e9370f81 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,317 @@
+2015-10-13  Jakub Jelinek  <jakub@redhat.com>
+	    Aldy Hernandez  <aldyh@redhat.com>
+	    Ilya Verbin  <ilya.verbin@intel.com>
+
+	* config/linux/affinity.c (omp_get_place_num_procs,
+	omp_get_place_proc_ids, gomp_get_place_proc_ids_8): New functions.
+	* config/linux/doacross.h: New file.
+	* config/posix/affinity.c (omp_get_place_num_procs,
+	omp_get_place_proc_ids, gomp_get_place_proc_ids_8): New functions.
+	* config/posix/doacross.h: New file.
+	* env.c: Include gomp-constants.h.
+	(struct gomp_task_icv): Rename run_sched_modifier to
+	run_sched_chunk_size.
+	(gomp_max_task_priority_var): New variable.
+	(parse_schedule): Rename run_sched_modifier to run_sched_chunk_size.
+	(handle_omp_display_env): Change _OPENMP value from 201307 to
+	201511.  Print OMP_MAX_TASK_PRIORITY.
+	(initialize_env): Parse OMP_MAX_TASK_PRIORITY.
+	(omp_set_schedule, omp_get_schedule): Rename modifier argument to
+	chunk_size and run_sched_modifier to run_sched_chunk_size.
+	(omp_get_max_task_priority, omp_get_initial_device,
+	omp_get_num_places, omp_get_place_num, omp_get_partition_num_places,
+	omp_get_partition_place_nums): New functions.
+	* fortran.c (omp_set_schedule_, omp_set_schedule_8_,
+	omp_get_schedule_, omp_get_schedule_8_): Rename modifier argument
+	to chunk_size.
+	(omp_get_num_places_, omp_get_place_num_procs_,
+	omp_get_place_num_procs_8_, omp_get_place_proc_ids_,
+	omp_get_place_proc_ids_8_, omp_get_place_num_,
+	omp_get_partition_num_places_, omp_get_partition_place_nums_,
+	omp_get_partition_place_nums_8_, omp_get_initial_device_,
+	omp_get_max_task_priority_): New functions.
+	* libgomp_g.h (GOMP_loop_doacross_static_start,
+	GOMP_loop_doacross_dynamic_start, GOMP_loop_doacross_guided_start,
+	GOMP_loop_doacross_runtime_start, GOMP_loop_ull_doacross_static_start,
+	GOMP_loop_ull_doacross_dynamic_start,
+	GOMP_loop_ull_doacross_guided_start,
+	GOMP_loop_ull_doacross_runtime_start, GOMP_doacross_post,
+	GOMP_doacross_wait, GOMP_doacross_ull_post, GOMP_doacross_ull_wait,
+	GOMP_taskloop, GOMP_taskloop_ull, GOMP_target_41,
+	GOMP_target_data_41, GOMP_target_update_41,
+	GOMP_target_enter_exit_data): New prototypes.
+	(GOMP_task): Add priority argument to the prototype.
+	* libgomp.h (_LIBGOMP_CHECKING_): Define to 0 if not yet defined.
+	(struct gomp_doacross_work_share): New type.
+	(struct gomp_work_share): Add doacross field.
+	(struct gomp_task_icv): Rename run_sched_modifier to
+	run_sched_chunk_size.
+	(enum gomp_task_kind): Rename GOMP_TASK_IFFALSE to
+	GOMP_TASK_UNDEFERRED.  Add comments.
+	(struct gomp_task_depend_entry): Add comments.
+	(struct gomp_task): Likewise.
+	(struct gomp_taskgroup): Likewise.
+	(struct gomp_target_task): New type.
+	(struct gomp_team): Add comment.
+	(gomp_get_place_proc_ids_8, gomp_doacross_init,
+	gomp_doacross_ull_init, gomp_task_maybe_wait_for_dependencies,
+	gomp_create_target_task, gomp_target_task_fn): New prototypes.
+	(struct target_var_desc): New type.
+	(struct target_mem_desc): Adjust comment.  Use struct
+	target_var_desc instead of splay_tree_key for list.
+	(REFCOUNT_INFINITY): Define.
+	(struct splay_tree_key_s): Remove copy_from field.
+	(struct gomp_device_descr): Add dev2dev_func field.
+	(enum gomp_map_vars_kind): New enum.
+	(gomp_map_vars): Add one argument.
+	* libgomp.map (OMP_4.5): Export omp_get_max_task_priority,
+	omp_get_max_task_priority_, omp_get_num_places, omp_get_num_places_,
+	omp_get_place_num_procs, omp_get_place_num_procs_,
+	omp_get_place_num_procs_8_, omp_get_place_proc_ids,
+	omp_get_place_proc_ids_, omp_get_place_proc_ids_8_, omp_get_place_num,
+	omp_get_place_num_, omp_get_partition_num_places,
+	omp_get_partition_num_places_, omp_get_partition_place_nums,
+	omp_get_partition_place_nums_, omp_get_partition_place_nums_8_,
+	omp_get_initial_device, omp_get_initial_device_, omp_target_alloc,
+	omp_target_free, omp_target_is_present, omp_target_memcpy,
+	omp_target_memcpy_rect, omp_target_associate_ptr and
+	omp_target_disassociate_ptr.
+	(GOMP_4.0.2): Renamed to ...
+	(GOMP_4.5): ... this.  Export GOMP_target_41, GOMP_target_data_41,
+	GOMP_target_update_41, GOMP_target_enter_exit_data, GOMP_taskloop,
+	GOMP_taskloop_ull, GOMP_loop_doacross_dynamic_start,
+	GOMP_loop_doacross_guided_start, GOMP_loop_doacross_runtime_start,
+	GOMP_loop_doacross_static_start, GOMP_doacross_post,
+	GOMP_doacross_wait, GOMP_loop_ull_doacross_dynamic_start,
+	GOMP_loop_ull_doacross_guided_start,
+	GOMP_loop_ull_doacross_runtime_start,
+	GOMP_loop_ull_doacross_static_start, GOMP_doacross_ull_post and
+	GOMP_doacross_ull_wait.
+	* libgomp.texi: Document omp_get_max_task_priority.
+	Rename modifier argument to chunk_size for omp_set_schedule and
+	omp_get_schedule.  Document OMP_MAX_TASK_PRIORITY env var.
+	* loop.c (GOMP_loop_runtime_start): Adjust for run_sched_modifier
+	to run_sched_chunk_size renaming.
+	(GOMP_loop_ordered_runtime_start): Likewise.
+	(gomp_loop_doacross_static_start, gomp_loop_doacross_dynamic_start,
+	gomp_loop_doacross_guided_start, GOMP_loop_doacross_runtime_start,
+	GOMP_parallel_loop_runtime_start): New functions.
+	(GOMP_parallel_loop_runtime): Adjust for run_sched_modifier
+	to run_sched_chunk_size renaming.
+	(GOMP_loop_doacross_static_start, GOMP_loop_doacross_dynamic_start,
+	GOMP_loop_doacross_guided_start): New functions or aliases.
+	* loop_ull.c (GOMP_loop_ull_runtime_start): Adjust for
+	run_sched_modifier to run_sched_chunk_size renaming.
+	(GOMP_loop_ull_ordered_runtime_start): Likewise.
+	(gomp_loop_ull_doacross_static_start,
+	gomp_loop_ull_doacross_dynamic_start,
+	gomp_loop_ull_doacross_guided_start,
+	GOMP_loop_ull_doacross_runtime_start): New functions.
+	(GOMP_loop_ull_doacross_static_start,
+	GOMP_loop_ull_doacross_dynamic_start,
+	GOMP_loop_ull_doacross_guided_start): New functions or aliases.
+	* oacc-mem.c (acc_map_data, present_create_copy,
+	gomp_acc_insert_pointer): Pass GOMP_MAP_VARS_OPENACC instead of false
+	to gomp_map_vars.
+	(gomp_acc_remove_pointer): Use copy_from from target_var_desc.
+	* oacc-parallel.c (GOACC_data_start): Pass GOMP_MAP_VARS_OPENACC
+	instead of false to gomp_map_vars.
+	(GOACC_parallel_keyed): Likewise.  Use copy_from from target_var_desc.
+	* omp.h.in (omp_lock_hint_t): New type.
+	(omp_init_lock_with_hint, omp_init_nest_lock_with_hint,
+	omp_get_num_places, omp_get_place_num_procs, omp_get_place_proc_ids,
+	omp_get_place_num, omp_get_partition_num_places,
+	omp_get_partition_place_nums, omp_get_initial_device,
+	omp_get_max_task_priority, omp_target_alloc, omp_target_free,
+	omp_target_is_present, omp_target_memcpy, omp_target_memcpy_rect,
+	omp_target_associate_ptr, omp_target_disassociate_ptr): New
+	prototypes.
+	* omp_lib.f90.in (omp_lock_hint_kind): New parameter.
+	(omp_lock_hint_none, omp_lock_hint_uncontended,
+	omp_lock_hint_contended, omp_lock_hint_nonspeculative,
+	omp_lock_hint_speculative): New parameters.
+	(omp_init_lock_with_hint, omp_init_nest_lock_with_hint,
+	omp_get_num_places, omp_get_place_num_procs, omp_get_place_proc_ids,
+	omp_get_place_num, omp_get_partition_num_places,
+	omp_get_partition_place_nums, omp_get_initial_device,
+	omp_get_max_task_priority): New interfaces.
+	(omp_set_schedule, omp_get_schedule): Rename modifier argument
+	to chunk_size.
+	* omp_lib.h.in (omp_lock_hint_kind): New parameter.
+	(omp_lock_hint_none, omp_lock_hint_uncontended,
+	omp_lock_hint_contended, omp_lock_hint_nonspeculative,
+	omp_lock_hint_speculative): New parameters.
+	(omp_init_lock_with_hint, omp_init_nest_lock_with_hint,
+	omp_get_num_places, omp_get_place_num_procs, omp_get_place_proc_ids,
+	omp_get_place_num, omp_get_partition_num_places,
+	omp_get_partition_place_nums, omp_get_initial_device,
+	omp_get_max_task_priority): New functions and subroutines.
+	* ordered.c: Include stdarg.h and string.h.
+	(MAX_COLLAPSED_BITS): Define.
+	(gomp_doacross_init, GOMP_doacross_post, GOMP_doacross_wait,
+	gomp_doacross_ull_init, GOMP_doacross_ull_post,
+	GOMP_doacross_ull_wait): New functions.
+	* target.c: Include errno.h.
+	(resolve_device): If device is not initialized, call
+	gomp_init_device on it.
+	(gomp_map_lookup): New function.
+	(gomp_map_vars_existing): Add tgt_var argument, fill it in.
+	Don't bump refcount if REFCOUNT_INFINITY.  Handle
+	GOMP_MAP_ALWAYS_TO_P.
+	(get_kind): Rename is_openacc argument to short_mapkind.
+	(gomp_map_pointer): Use gomp_map_lookup.
+	(gomp_map_fields_existing): New function.
+	(gomp_map_vars): Rename is_openacc argument to short_mapkind
+	and is_target to pragma_kind.  Handle GOMP_MAP_VARS_ENTER_DATA,
+	handle GOMP_MAP_FIRSTPRIVATE_INT, GOMP_MAP_STRUCT,
+	GOMP_MAP_USE_DEVICE_PTR, GOMP_MAP_ZERO_LEN_ARRAY_SECTION.
+	Adjust for tgt->list changed type and copy_from living in there.
+	(gomp_copy_from_async): Adjust for tgt->list changed type and
+	copy_from living in there.
+	(gomp_unmap_vars): Likewise.
+	(gomp_update): Likewise.  Rename is_openacc argument to
+	short_mapkind.  Don't fail if object is not mapped.
+	(gomp_load_image_to_device): Initialize refcount to
+	REFCOUNT_INFINITY.
+	(gomp_target_fallback): New function.
+	(gomp_get_target_fn_addr): Likewise.
+	(GOMP_target): Adjust gomp_map_vars caller, use
+	gomp_get_target_fn_addr and gomp_target_fallback.
+	(GOMP_target_41): New function.
+	(gomp_target_data_fallback): New function.
+	(GOMP_target_data): Use it, adjust gomp_map_vars caller.
+	(GOMP_target_data_41): New function.
+	(GOMP_target_update): Adjust gomp_update caller.
+	(GOMP_target_update_41): New function.
+	(gomp_exit_data, GOMP_target_enter_exit_data,
+	gomp_target_task_fn, omp_target_alloc, omp_target_free,
+	omp_target_is_present, omp_target_memcpy,
+	omp_target_memcpy_rect_worker, omp_target_memcpy_rect,
+	omp_target_associate_ptr, omp_target_disassociate_ptr,
+	gomp_load_plugin_for_device): New functions.
+	* task.c: Include gomp-constants.h.  Include taskloop.c
+	twice to get GOMP_taskloop and GOMP_taskloop_ull definitions.
+	(gomp_task_handle_depend): New function.
+	(GOMP_task): Use it.  Add priority argument.  Use
+	gomp-constants.h constants instead of hardcoded numbers.
+	Rename GOMP_TASK_IFFALSE to GOMP_TASK_UNDEFERRED.
+	(gomp_create_target_task): New function.
+	(verify_children_queue, verify_taskgroup_queue,
+	verify_task_queue): New functions.
+	(gomp_task_run_pre): Call verify_*_queue functions.
+	If an upcoming tied task is about to leave the sibling or
+	taskgroup queues in an invalid state, adjust appropriately.
+	Remove taskgroup argument.  Add comments.
+	(gomp_task_run_post_handle_dependers): Add comments.
+	(gomp_task_run_post_remove_parent): Likewise.
+	(gomp_barrier_handle_tasks): Adjust gomp_task_run_pre caller.
+	(GOMP_taskwait): Likewise.  Add comments.
+	(gomp_task_maybe_wait_for_dependencies): Fix scheduling
+	problem such that the first non parent_depends_on task does not
+	end up at the end of the children queue.
+	(GOMP_taskgroup_start): Rename GOMP_TASK_IFFALSE to
+	GOMP_TASK_UNDEFERRED.
+	(GOMP_taskgroup_end): Adjust gomp_task_run_pre caller.
+	* taskloop.c: New file.
+	* testsuite/lib/libgomp.exp
+	(check_effective_target_offload_device_nonshared_as): New proc.
+	* testsuite/libgomp.c/affinity-2.c: New test.
+	* testsuite/libgomp.c/doacross-1.c: New test.
+	* testsuite/libgomp.c/doacross-2.c: New test.
+	* testsuite/libgomp.c/examples-4/declare_target-1.c (fib_wrapper):
+	Add map clause to target.
+	* testsuite/libgomp.c/examples-4/declare_target-4.c (accum): Likewise.
+	* testsuite/libgomp.c/examples-4/declare_target-5.c (accum): Likewise.
+	* testsuite/libgomp.c/examples-4/device-1.c (main): Likewise.
+	* testsuite/libgomp.c/examples-4/device-3.c (main): Likewise.
+	* testsuite/libgomp.c/examples-4/target_data-3.c (gramSchmidt):
+	Likewise.
+	* testsuite/libgomp.c/examples-4/teams-2.c (dotprod): Likewise.
+	* testsuite/libgomp.c/examples-4/teams-3.c (dotprod): Likewise.
+	* testsuite/libgomp.c/examples-4/teams-4.c (dotprod): Likewise.
+	* testsuite/libgomp.c/for-2.h (OMPTGT, OMPTO, OMPFROM): Define if
+	not defined.  Use those where needed.
+	* testsuite/libgomp.c/for-4.c: New test.
+	* testsuite/libgomp.c/for-5.c: New test.
+	* testsuite/libgomp.c/for-6.c: New test.
+	* testsuite/libgomp.c/linear-1.c: New test.
+	* testsuite/libgomp.c/ordered-4.c: New test.
+	* testsuite/libgomp.c/pr66199-2.c (f2): Adjust for linear clause
+	only allowed on the loop iterator.
+	* testsuite/libgomp.c/pr66199-3.c: New test.
+	* testsuite/libgomp.c/pr66199-4.c: New test.
+	* testsuite/libgomp.c/reduction-7.c: New test.
+	* testsuite/libgomp.c/reduction-8.c: New test.
+	* testsuite/libgomp.c/reduction-9.c: New test.
+	* testsuite/libgomp.c/reduction-10.c: New test.
+	* testsuite/libgomp.c/target-1.c (fn2, fn3, fn4): Add
+	map(tofrom:s).
+	* testsuite/libgomp.c/target-2.c (fn2, fn3, fn4): Likewise.
+	* testsuite/libgomp.c/target-7.c (foo): Add map(h) where needed.
+	* testsuite/libgomp.c/target-11.c: New test.
+	* testsuite/libgomp.c/target-12.c: New test.
+	* testsuite/libgomp.c/target-13.c: New test.
+	* testsuite/libgomp.c/target-14.c: New test.
+	* testsuite/libgomp.c/target-15.c: New test.
+	* testsuite/libgomp.c/target-16.c: New test.
+	* testsuite/libgomp.c/target-17.c: New test.
+	* testsuite/libgomp.c/target-18.c: New test.
+	* testsuite/libgomp.c/target-19.c: New test.
+	* testsuite/libgomp.c/target-20.c: New test.
+	* testsuite/libgomp.c/target-21.c: New test.
+	* testsuite/libgomp.c/target-22.c: New test.
+	* testsuite/libgomp.c/target-23.c: New test.
+	* testsuite/libgomp.c/target-24.c: New test.
+	* testsuite/libgomp.c/target-25.c: New test.
+	* testsuite/libgomp.c/target-26.c: New test.
+	* testsuite/libgomp.c/target-27.c: New test.
+	* testsuite/libgomp.c/taskloop-1.c: New test.
+	* testsuite/libgomp.c/taskloop-2.c: New test.
+	* testsuite/libgomp.c/taskloop-3.c: New test.
+	* testsuite/libgomp.c/taskloop-4.c: New test.
+	* testsuite/libgomp.c++/ctor-13.C: New test.
+	* testsuite/libgomp.c++/doacross-1.C: New test.
+	* testsuite/libgomp.c++/examples-4/declare_target-2.C:
+	Replace offload_device with offload_device_nonshared_as.
+	* testsuite/libgomp.c++/for-12.C: New test.
+	* testsuite/libgomp.c++/for-13.C: New test.
+	* testsuite/libgomp.c++/for-14.C: New test.
+	* testsuite/libgomp.c++/linear-1.C: New test.
+	* testsuite/libgomp.c++/member-1.C: New test.
+	* testsuite/libgomp.c++/member-2.C: New test.
+	* testsuite/libgomp.c++/member-3.C: New test.
+	* testsuite/libgomp.c++/member-4.C: New test.
+	* testsuite/libgomp.c++/member-5.C: New test.
+	* testsuite/libgomp.c++/ordered-1.C: New test.
+	* testsuite/libgomp.c++/reduction-5.C: New test.
+	* testsuite/libgomp.c++/reduction-6.C: New test.
+	* testsuite/libgomp.c++/reduction-7.C: New test.
+	* testsuite/libgomp.c++/reduction-8.C: New test.
+	* testsuite/libgomp.c++/reduction-9.C: New test.
+	* testsuite/libgomp.c++/reduction-10.C: New test.
+	* testsuite/libgomp.c++/reference-1.C: New test.
+	* testsuite/libgomp.c++/simd14.C: New test.
+	* testsuite/libgomp.c++/target-2.C (fn2): Add map(tofrom: s) clause.
+	* testsuite/libgomp.c++/target-5.C: New test.
+	* testsuite/libgomp.c++/target-6.C: New test.
+	* testsuite/libgomp.c++/target-7.C: New test.
+	* testsuite/libgomp.c++/target-8.C: New test.
+	* testsuite/libgomp.c++/target-9.C: New test.
+	* testsuite/libgomp.c++/target-10.C: New test.
+	* testsuite/libgomp.c++/target-11.C: New test.
+	* testsuite/libgomp.c++/target-12.C: New test.
+	* testsuite/libgomp.c++/taskloop-1.C: New test.
+	* testsuite/libgomp.c++/taskloop-2.C: New test.
+	* testsuite/libgomp.c++/taskloop-3.C: New test.
+	* testsuite/libgomp.c++/taskloop-4.C: New test.
+	* testsuite/libgomp.c++/taskloop-5.C: New test.
+	* testsuite/libgomp.c++/taskloop-6.C: New test.
+	* testsuite/libgomp.c++/taskloop-7.C: New test.
+	* testsuite/libgomp.c++/taskloop-8.C: New test.
+	* testsuite/libgomp.c++/taskloop-9.C: New test.
+	* testsuite/libgomp.fortran/affinity1.f90: New test.
+	* testsuite/libgomp.fortran/affinity2.f90: New test.
+
 2015-10-13  Tom de Vries  <tom@codesourcery.com>
 
 	PR tree-optimization/67476
diff --git a/libgomp/config/linux/affinity.c b/libgomp/config/linux/affinity.c
index 17b65afb49a..775ee0a7fdf 100644
--- a/libgomp/config/linux/affinity.c
+++ b/libgomp/config/linux/affinity.c
@@ -353,6 +353,45 @@ gomp_affinity_print_place (void *p)
     fprintf (stderr, ":%lu", len);
 }
 
+int
+omp_get_place_num_procs (int place_num)
+{
+  if (place_num < 0 || place_num >= gomp_places_list_len)
+    return 0;
+
+  cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
+  return gomp_cpuset_popcount (gomp_cpuset_size, cpusetp);
+}
+
+void
+omp_get_place_proc_ids (int place_num, int *ids)
+{
+  if (place_num < 0 || place_num >= gomp_places_list_len)
+    return;
+
+  cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
+  unsigned long i, max = 8 * gomp_cpuset_size;
+  for (i = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
+      *ids++ = i;
+}
+
+void
+gomp_get_place_proc_ids_8 (int place_num, int64_t *ids)
+{
+  if (place_num < 0 || place_num >= gomp_places_list_len)
+    return;
+
+  cpu_set_t *cpusetp = (cpu_set_t *) gomp_places_list[place_num];
+  unsigned long i, max = 8 * gomp_cpuset_size;
+  for (i = 0; i < max; i++)
+    if (CPU_ISSET_S (i, gomp_cpuset_size, cpusetp))
+      *ids++ = i;
+}
+
+ialias(omp_get_place_num_procs)
+ialias(omp_get_place_proc_ids)
+
 #else
 
 #include "../posix/affinity.c"
diff --git a/libgomp/config/linux/doacross.h b/libgomp/config/linux/doacross.h
new file mode 100644
index 00000000000..7a5a645f3cf
--- /dev/null
+++ b/libgomp/config/linux/doacross.h
@@ -0,0 +1,57 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a Linux specific implementation of doacross spinning.  */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+#include <errno.h>
+#include "wait.h"
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility push(hidden)
+#endif
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+				  unsigned long cur)
+{
+  /* FIXME: back off depending on how large expected - cur is.  */
+  do
+    {
+      cpu_relax ();
+      cur = __atomic_load_n (addr, MEMMODEL_RELAXED);
+      if (expected < cur)
+	return;
+    }
+  while (1);
+}
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility pop
+#endif
+
+#endif /* GOMP_DOACROSS_H */
diff --git a/libgomp/config/posix/affinity.c b/libgomp/config/posix/affinity.c
index 6840d3a727d..9008853c953 100644
--- a/libgomp/config/posix/affinity.c
+++ b/libgomp/config/posix/affinity.c
@@ -114,3 +114,27 @@ gomp_affinity_print_place (void *p)
 {
   (void) p;
 }
+
+int
+omp_get_place_num_procs (int place_num)
+{
+  (void) place_num;
+  return 0;
+}
+
+void
+omp_get_place_proc_ids (int place_num, int *ids)
+{
+  (void) place_num;
+  (void) ids;
+}
+
+void
+gomp_get_place_proc_ids_8 (int place_num, int64_t *ids)
+{
+  (void) place_num;
+  (void) ids;
+}
+
+ialias(omp_get_place_num_procs)
+ialias(omp_get_place_proc_ids)
diff --git a/libgomp/config/posix/doacross.h b/libgomp/config/posix/doacross.h
new file mode 100644
index 00000000000..537bcbba51c
--- /dev/null
+++ b/libgomp/config/posix/doacross.h
@@ -0,0 +1,62 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This is a generic implementation of doacross spinning.  */
+
+#ifndef GOMP_DOACROSS_H
+#define GOMP_DOACROSS_H 1
+
+#include "libgomp.h"
+#include <errno.h>
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility push(hidden)
+#endif
+
+static inline void
+cpu_relax (void)
+{
+  __asm volatile ("" : : : "memory");
+}
+
+static inline void doacross_spin (unsigned long *addr, unsigned long expected,
+				  unsigned long cur)
+{
+  /* FIXME: back off depending on how large expected - cur is.  */
+  do
+    {
+      cpu_relax ();
+      cur = __atomic_load_n (addr, MEMMODEL_RELAXED);
+      if (expected < cur)
+	return;
+    }
+  while (1);
+}
+
+#ifdef HAVE_ATTRIBUTE_VISIBILITY
+# pragma GCC visibility pop
+#endif
+
+#endif /* GOMP_DOACROSS_H */
diff --git a/libgomp/env.c b/libgomp/env.c
index 6b5e963c4ea..5d6cdcf0184 100644
--- a/libgomp/env.c
+++ b/libgomp/env.c
@@ -29,6 +29,7 @@
 #include "libgomp.h"
 #include "libgomp_f.h"
 #include "oacc-int.h"
+#include "gomp-constants.h"
 #include <ctype.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -58,7 +59,7 @@ struct gomp_task_icv gomp_global_icv = {
   .nthreads_var = 1,
   .thread_limit_var = UINT_MAX,
   .run_sched_var = GFS_DYNAMIC,
-  .run_sched_modifier = 1,
+  .run_sched_chunk_size = 1,
   .default_device_var = 0,
   .dyn_var = false,
   .nest_var = false,
@@ -68,6 +69,7 @@ struct gomp_task_icv gomp_global_icv = {
 
 unsigned long gomp_max_active_levels_var = INT_MAX;
 bool gomp_cancel_var = false;
+int gomp_max_task_priority_var = 0;
 #ifndef HAVE_SYNC_BUILTINS
 gomp_mutex_t gomp_managed_threads_lock;
 #endif
@@ -123,7 +125,7 @@ parse_schedule (void)
     ++env;
   if (*env == '\0')
     {
-      gomp_global_icv.run_sched_modifier
+      gomp_global_icv.run_sched_chunk_size
 	= gomp_global_icv.run_sched_var != GFS_STATIC;
       return;
     }
@@ -149,7 +151,7 @@ parse_schedule (void)
 
   if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
     value = 1;
-  gomp_global_icv.run_sched_modifier = value;
+  gomp_global_icv.run_sched_chunk_size = value;
   return;
 
  unknown:
@@ -1069,7 +1071,7 @@ handle_omp_display_env (unsigned long stacksize, int wait_policy)
 
   fputs ("\nOPENMP DISPLAY ENVIRONMENT BEGIN\n", stderr);
 
-  fputs ("  _OPENMP = '201307'\n", stderr);
+  fputs ("  _OPENMP = '201511'\n", stderr);
   fprintf (stderr, "  OMP_DYNAMIC = '%s'\n",
 	   gomp_global_icv.dyn_var ? "TRUE" : "FALSE");
   fprintf (stderr, "  OMP_NESTED = '%s'\n",
@@ -1157,6 +1159,8 @@ handle_omp_display_env (unsigned long stacksize, int wait_policy)
 	   gomp_cancel_var ? "TRUE" : "FALSE");
   fprintf (stderr, "  OMP_DEFAULT_DEVICE = '%d'\n",
 	   gomp_global_icv.default_device_var);
+  fprintf (stderr, "  OMP_MAX_TASK_PRIORITY = '%d'\n",
+	   gomp_max_task_priority_var);
 
   if (verbose)
     {
@@ -1189,6 +1193,7 @@ initialize_env (void)
   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
   parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
   parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
+  parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true);
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
 		       true);
   if (parse_unsigned_long ("OMP_THREAD_LIMIT", &thread_limit_var, false))
@@ -1337,21 +1342,21 @@ omp_get_nested (void)
 }
 
 void
-omp_set_schedule (omp_sched_t kind, int modifier)
+omp_set_schedule (omp_sched_t kind, int chunk_size)
 {
   struct gomp_task_icv *icv = gomp_icv (true);
   switch (kind)
     {
     case omp_sched_static:
-      if (modifier < 1)
-	modifier = 0;
-      icv->run_sched_modifier = modifier;
+      if (chunk_size < 1)
+	chunk_size = 0;
+      icv->run_sched_chunk_size = chunk_size;
       break;
     case omp_sched_dynamic:
     case omp_sched_guided:
-      if (modifier < 1)
-	modifier = 1;
-      icv->run_sched_modifier = modifier;
+      if (chunk_size < 1)
+	chunk_size = 1;
+      icv->run_sched_chunk_size = chunk_size;
       break;
     case omp_sched_auto:
       break;
@@ -1362,11 +1367,11 @@ omp_set_schedule (omp_sched_t kind, int modifier)
 }
 
 void
-omp_get_schedule (omp_sched_t *kind, int *modifier)
+omp_get_schedule (omp_sched_t *kind, int *chunk_size)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   *kind = icv->run_sched_var;
-  *modifier = icv->run_sched_modifier;
+  *chunk_size = icv->run_sched_chunk_size;
 }
 
 int
@@ -1402,6 +1407,12 @@ omp_get_cancellation (void)
   return gomp_cancel_var;
 }
 
+int
+omp_get_max_task_priority (void)
+{
+  return gomp_max_task_priority_var;
+}
+
 omp_proc_bind_t
 omp_get_proc_bind (void)
 {
@@ -1450,6 +1461,59 @@ omp_is_initial_device (void)
   return 1;
 }
 
+int
+omp_get_initial_device (void)
+{
+  return GOMP_DEVICE_HOST_FALLBACK;
+}
+
+int
+omp_get_num_places (void)
+{
+  return gomp_places_list_len;
+}
+
+int
+omp_get_place_num (void)
+{
+  if (gomp_places_list == NULL)
+    return -1;
+
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->place == 0)
+    gomp_init_affinity ();
+
+  return (int) thr->place - 1;
+}
+
+int
+omp_get_partition_num_places (void)
+{
+  if (gomp_places_list == NULL)
+    return 0;
+
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->place == 0)
+    gomp_init_affinity ();
+
+  return thr->ts.place_partition_len;
+}
+
+void
+omp_get_partition_place_nums (int *place_nums)
+{
+  if (gomp_places_list == NULL)
+    return;
+
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->place == 0)
+    gomp_init_affinity ();
+
+  unsigned int i;
+  for (i = 0; i < thr->ts.place_partition_len; i++)
+    *place_nums++ = thr->ts.place_partition_off + i;
+}
+
 ialias (omp_set_dynamic)
 ialias (omp_set_nested)
 ialias (omp_set_num_threads)
@@ -1469,3 +1533,9 @@ ialias (omp_get_num_devices)
 ialias (omp_get_num_teams)
 ialias (omp_get_team_num)
 ialias (omp_is_initial_device)
+ialias (omp_get_initial_device)
+ialias (omp_get_max_task_priority)
+ialias (omp_get_num_places)
+ialias (omp_get_place_num)
+ialias (omp_get_partition_num_places)
+ialias (omp_get_partition_place_nums)
diff --git a/libgomp/fortran.c b/libgomp/fortran.c
index 993145f8890..ceff9ac48e6 100644
--- a/libgomp/fortran.c
+++ b/libgomp/fortran.c
@@ -68,12 +68,20 @@ ialias_redirect (omp_get_active_level)
 ialias_redirect (omp_in_final)
 ialias_redirect (omp_get_cancellation)
 ialias_redirect (omp_get_proc_bind)
+ialias_redirect (omp_get_num_places)
+ialias_redirect (omp_get_place_num_procs)
+ialias_redirect (omp_get_place_proc_ids)
+ialias_redirect (omp_get_place_num)
+ialias_redirect (omp_get_partition_num_places)
+ialias_redirect (omp_get_partition_place_nums)
 ialias_redirect (omp_set_default_device)
 ialias_redirect (omp_get_default_device)
 ialias_redirect (omp_get_num_devices)
 ialias_redirect (omp_get_num_teams)
 ialias_redirect (omp_get_team_num)
 ialias_redirect (omp_is_initial_device)
+ialias_redirect (omp_get_initial_device)
+ialias_redirect (omp_get_max_task_priority)
 #endif
 
 #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
@@ -343,35 +351,35 @@ omp_get_wtime_ (void)
 }
 
 void
-omp_set_schedule_ (const int32_t *kind, const int32_t *modifier)
+omp_set_schedule_ (const int32_t *kind, const int32_t *chunk_size)
 {
-  omp_set_schedule (*kind, *modifier);
+  omp_set_schedule (*kind, *chunk_size);
 }
 
 void
-omp_set_schedule_8_ (const int32_t *kind, const int64_t *modifier)
+omp_set_schedule_8_ (const int32_t *kind, const int64_t *chunk_size)
 {
-  omp_set_schedule (*kind, TO_INT (*modifier));
+  omp_set_schedule (*kind, TO_INT (*chunk_size));
 }
 
 void
-omp_get_schedule_ (int32_t *kind, int32_t *modifier)
+omp_get_schedule_ (int32_t *kind, int32_t *chunk_size)
 {
   omp_sched_t k;
-  int m;
-  omp_get_schedule (&k, &m);
+  int cs;
+  omp_get_schedule (&k, &cs);
   *kind = k;
-  *modifier = m;
+  *chunk_size = cs;
 }
 
 void
-omp_get_schedule_8_ (int32_t *kind, int64_t *modifier)
+omp_get_schedule_8_ (int32_t *kind, int64_t *chunk_size)
 {
   omp_sched_t k;
-  int m;
-  omp_get_schedule (&k, &m);
+  int cs;
+  omp_get_schedule (&k, &cs);
   *kind = k;
-  *modifier = m;
+  *chunk_size = cs;
 }
 
 int32_t
@@ -452,6 +460,69 @@ omp_get_proc_bind_ (void)
   return omp_get_proc_bind ();
 }
 
+int32_t
+omp_get_num_places_ (void)
+{
+  return omp_get_num_places ();
+}
+
+int32_t
+omp_get_place_num_procs_ (const int32_t *place_num)
+{
+  return omp_get_place_num_procs (*place_num);
+}
+
+int32_t
+omp_get_place_num_procs_8_ (const int64_t *place_num)
+{
+  return omp_get_place_num_procs (TO_INT (*place_num));
+}
+
+void
+omp_get_place_proc_ids_ (const int32_t *place_num, int32_t *ids)
+{
+  omp_get_place_proc_ids (*place_num, ids);
+}
+
+void
+omp_get_place_proc_ids_8_ (const int64_t *place_num, int64_t *ids)
+{
+  gomp_get_place_proc_ids_8 (TO_INT (*place_num), ids);
+}
+
+int32_t
+omp_get_place_num_ (void)
+{
+  return omp_get_place_num ();
+}
+
+int32_t
+omp_get_partition_num_places_ (void)
+{
+  return omp_get_partition_num_places ();
+}
+
+void
+omp_get_partition_place_nums_ (int32_t *place_nums)
+{
+  omp_get_partition_place_nums (place_nums);
+}
+
+void
+omp_get_partition_place_nums_8_ (int64_t *place_nums)
+{
+  if (gomp_places_list == NULL)
+    return;
+
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->place == 0)
+    gomp_init_affinity ();
+
+  unsigned int i;
+  for (i = 0; i < thr->ts.place_partition_len; i++)
+    *place_nums++ = (int64_t) thr->ts.place_partition_off + i;
+}
+
 void
 omp_set_default_device_ (const int32_t *device_num)
 {
@@ -493,3 +564,15 @@ omp_is_initial_device_ (void)
 {
   return omp_is_initial_device ();
 }
+
+int32_t
+omp_get_initial_device_ (void)
+{
+  return omp_get_initial_device ();
+}
+
+int32_t
+omp_get_max_task_priority_ (void)
+{
+  return omp_get_max_task_priority ();
+}
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 04262c4ab28..9c8b1fb8744 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -36,6 +36,11 @@
 #ifndef LIBGOMP_H 
 #define LIBGOMP_H 1
 
+#ifndef _LIBGOMP_CHECKING_
+/* Define to 1 to perform internal sanity checks.  */
+#define _LIBGOMP_CHECKING_ 0
+#endif
+
 #include "config.h"
 #include "gstdint.h"
 #include "libgomp-plugin.h"
@@ -78,6 +83,44 @@ enum gomp_schedule_type
   GFS_AUTO
 };
 
+struct gomp_doacross_work_share
+{
+  union {
+    /* chunk_size copy, as ws->chunk_size is multiplied by incr for
+       GFS_DYNAMIC.  */
+    long chunk_size;
+    /* Likewise, but for ull implementation.  */
+    unsigned long long chunk_size_ull;
+    /* For schedule(static,0) this is the number
+       of iterations assigned to the last thread, i.e. number of
+       iterations / number of threads.  */
+    long q;
+    /* Likewise, but for ull implementation.  */
+    unsigned long long q_ull;
+  };
+  /* Size of each array entry (padded to cache line size).  */
+  unsigned long elt_sz;
+  /* Number of dimensions in sink vectors.  */
+  unsigned int ncounts;
+  /* True if the iterations can be flattened.  */
+  bool flattened;
+  /* Actual array (of elt_sz sized units), aligned to cache line size.
+     This is indexed by team_id for GFS_STATIC and outermost iteration
+     / chunk_size for other schedules.  */
+  unsigned char *array;
+  /* These two are only used for schedule(static,0).  */
+  /* This one is number of iterations % number of threads.  */
+  long t;
+  union {
+    /* And this one is cached t * (q + 1).  */
+    long boundary;
+    /* Likewise, but for the ull implementation.  */
+    unsigned long long boundary_ull;
+  };
+  /* Array of shift counts for each dimension if they can be flattened.  */
+  unsigned int shift_counts[];
+};
+
 struct gomp_work_share
 {
   /* This member records the SCHEDULE clause to be used for this construct.
@@ -109,13 +152,18 @@ struct gomp_work_share
     };
   };
 
-  /* This is a circular queue that details which threads will be allowed
-     into the ordered region and in which order.  When a thread allocates
-     iterations on which it is going to work, it also registers itself at
-     the end of the array.  When a thread reaches the ordered region, it
-     checks to see if it is the one at the head of the queue.  If not, it
-     blocks on its RELEASE semaphore.  */
-  unsigned *ordered_team_ids;
+  union {
+    /* This is a circular queue that details which threads will be allowed
+       into the ordered region and in which order.  When a thread allocates
+       iterations on which it is going to work, it also registers itself at
+       the end of the array.  When a thread reaches the ordered region, it
+       checks to see if it is the one at the head of the queue.  If not, it
+       blocks on its RELEASE semaphore.  */
+    unsigned *ordered_team_ids;
+
+    /* This is a pointer to DOACROSS work share data.  */
+    struct gomp_doacross_work_share *doacross;
+  };
 
   /* This is the number of threads that have registered themselves in
      the circular queue ordered_team_ids.  */
@@ -234,7 +282,7 @@ struct gomp_task_icv
 {
   unsigned long nthreads_var;
   enum gomp_schedule_type run_sched_var;
-  int run_sched_modifier;
+  int run_sched_chunk_size;
   int default_device_var;
   unsigned int thread_limit_var;
   bool dyn_var;
@@ -263,9 +311,13 @@ extern char *goacc_device_type;
 
 enum gomp_task_kind
 {
+  /* Implicit task.  */
   GOMP_TASK_IMPLICIT,
-  GOMP_TASK_IFFALSE,
+  /* Undeferred task.  */
+  GOMP_TASK_UNDEFERRED,
+  /* Task created by GOMP_task and waiting to be run.  */
   GOMP_TASK_WAITING,
+  /* Task currently executing or scheduled and about to execute.  */
   GOMP_TASK_TIED
 };
 
@@ -275,10 +327,13 @@ struct htab;
 
 struct gomp_task_depend_entry
 {
+  /* Address of dependency.  */
   void *addr;
   struct gomp_task_depend_entry *next;
   struct gomp_task_depend_entry *prev;
+  /* Task that provides the dependency in ADDR.  */
   struct gomp_task *task;
+  /* Depend entry is of type "IN".  */
   bool is_in;
   bool redundant;
   bool redundant_out;
@@ -306,19 +361,35 @@ struct gomp_taskwait
 
 struct gomp_task
 {
+  /* Parent of this task.  See the children description below.  */
   struct gomp_task *parent;
+  /* Circular list representing the children of this task.
+
+     In this list we first have parent_depends_on ready to run tasks,
+     then !parent_depends_on ready to run tasks, and finally already
+     running tasks.  */
   struct gomp_task *children;
   struct gomp_task *next_child;
   struct gomp_task *prev_child;
+  /* Circular task_queue in `struct gomp_team'.
+
+     GOMP_TASK_WAITING tasks come before GOMP_TASK_TIED tasks.  */
   struct gomp_task *next_queue;
   struct gomp_task *prev_queue;
+  /* Circular queue in gomp_taskgroup->children.
+
+     GOMP_TASK_WAITING tasks come before GOMP_TASK_TIED tasks.  */
   struct gomp_task *next_taskgroup;
   struct gomp_task *prev_taskgroup;
+  /* Taskgroup this task belongs in.  */
   struct gomp_taskgroup *taskgroup;
+  /* Tasks that depend on this task.  */
   struct gomp_dependers_vec *dependers;
   struct htab *depend_hash;
   struct gomp_taskwait *taskwait;
+  /* Number of items in DEPEND.  */
   size_t depend_count;
+  /* Number of tasks this task still depends on (its dependees).  */
   size_t num_dependees;
   struct gomp_task_icv icv;
   void (*fn) (void *);
@@ -327,13 +398,23 @@ struct gomp_task
   bool in_tied_task;
   bool final_task;
   bool copy_ctors_done;
+  /* Set for undeferred tasks with unsatisfied dependencies which
+     block further execution of their parent until the dependencies
+     are satisfied.  */
   bool parent_depends_on;
+  /* Dependencies provided and/or needed for this task.  DEPEND_COUNT
+     is the number of items available.  */
   struct gomp_task_depend_entry depend[];
 };
 
 struct gomp_taskgroup
 {
   struct gomp_taskgroup *prev;
+  /* Circular list of tasks that belong in this taskgroup.
+
+     Tasks are chained by next/prev_taskgroup within gomp_task, and
+     are ordered so that GOMP_TASK_WAITING tasks come before
+     GOMP_TASK_TIED tasks.  */
   struct gomp_task *children;
   bool in_taskgroup_wait;
   bool cancelled;
@@ -341,6 +422,17 @@ struct gomp_taskgroup
   size_t num_children;
 };
 
+struct gomp_target_task
+{
+  struct gomp_device_descr *devicep;
+  void (*fn) (void *);
+  size_t mapnum;
+  size_t *sizes;
+  unsigned short *kinds;
+  unsigned int flags;
+  void *hostaddrs[];
+};
+
 /* This structure describes a "team" of threads.  These are the threads
    that are spawned by a PARALLEL constructs, as well as the work sharing
    constructs that the team encounters.  */
@@ -403,6 +495,8 @@ struct gomp_team
   struct gomp_work_share work_shares[8];
 
   gomp_mutex_t task_lock;
+  /* Scheduled tasks.  Chain fields are next/prev_queue within a
+     gomp_task.  */
   struct gomp_task *task_queue;
   /* Number of all GOMP_TASK_{WAITING,TIED} tasks in the team.  */
   unsigned int task_count;
@@ -531,6 +625,7 @@ extern bool gomp_affinity_same_place (void *, void *);
 extern bool gomp_affinity_finalize_place_list (bool);
 extern bool gomp_affinity_init_level (int, unsigned long, bool);
 extern void gomp_affinity_print_place (void *);
+extern void gomp_get_place_proc_ids_8 (int, int64_t *);
 
 /* alloc.c */
 
@@ -600,6 +695,9 @@ extern void gomp_ordered_next (void);
 extern void gomp_ordered_static_init (void);
 extern void gomp_ordered_static_next (void);
 extern void gomp_ordered_sync (void);
+extern void gomp_doacross_init (unsigned, long *, long);
+extern void gomp_doacross_ull_init (unsigned, unsigned long long *,
+				    unsigned long long);
 
 /* parallel.c */
 
@@ -616,6 +714,11 @@ extern void gomp_init_task (struct gomp_task *, struct gomp_task *,
 			    struct gomp_task_icv *);
 extern void gomp_end_task (void);
 extern void gomp_barrier_handle_tasks (gomp_barrier_state_t);
+extern void gomp_task_maybe_wait_for_dependencies (void **);
+extern void gomp_create_target_task (struct gomp_device_descr *,
+				     void (*) (void *), size_t, void **,
+				     size_t *, unsigned short *, unsigned int,
+				     void **);
 
 static void inline
 gomp_finish_task (struct gomp_task *task)
@@ -636,11 +739,25 @@ extern void gomp_free_thread (void *);
 
 extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
+extern void gomp_target_task_fn (void *);
 
 typedef struct splay_tree_node_s *splay_tree_node;
 typedef struct splay_tree_s *splay_tree;
 typedef struct splay_tree_key_s *splay_tree_key;
 
+struct target_var_desc {
+  /* Splay key.  */
+  splay_tree_key key;
+  /* True if data should be copied from device to host at the end.  */
+  bool copy_from;
+  /* True if data should always be copied from device to host at the end.  */
+  bool always_copy_from;
+  /* Relative offset against key host_start.  */
+  uintptr_t offset;
+  /* Actual length.  */
+  uintptr_t length;
+};
+
 struct target_mem_desc {
   /* Reference count.  */
   uintptr_t refcount;
@@ -660,11 +777,14 @@ struct target_mem_desc {
   /* Corresponding target device descriptor.  */
   struct gomp_device_descr *device_descr;
 
-  /* List of splay keys to remove (or decrease refcount)
+  /* List of target items to remove (or decrease refcount)
      at the end of region.  */
-  splay_tree_key list[];
+  struct target_var_desc list[];
 };
 
+/* Special value for refcount - infinity.  */
+#define REFCOUNT_INFINITY (~(uintptr_t) 0)
+
 struct splay_tree_key_s {
   /* Address of the host object.  */
   uintptr_t host_start;
@@ -678,8 +798,6 @@ struct splay_tree_key_s {
   uintptr_t refcount;
   /* Asynchronous reference count.  */
   uintptr_t async_refcount;
-  /* True if data should be copied from device to host at the end.  */
-  bool copy_from;
 };
 
 #include "splay-tree.h"
@@ -757,6 +875,7 @@ struct gomp_device_descr
   void (*free_func) (int, void *);
   void *(*dev2host_func) (int, void *, const void *, size_t);
   void *(*host2dev_func) (int, void *, const void *, size_t);
+  void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
 
   /* Splay tree containing information about mapped memory regions.  */
@@ -774,12 +893,22 @@ struct gomp_device_descr
   acc_dispatch_t openacc;
 };
 
+/* Kind of pragma for which gomp_map_vars () is called.  */
+enum gomp_map_vars_kind
+{
+  GOMP_MAP_VARS_OPENACC,
+  GOMP_MAP_VARS_TARGET,
+  GOMP_MAP_VARS_DATA,
+  GOMP_MAP_VARS_ENTER_DATA
+};
+
 extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
 extern void gomp_acc_remove_pointer (void *, bool, int, int);
 
 extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 					      size_t, void **, void **,
-					      size_t *, void *, bool, bool);
+					      size_t *, void *, bool,
+					      enum gomp_map_vars_kind);
 extern void gomp_copy_from_async (struct target_mem_desc *);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
 extern void gomp_init_device (struct gomp_device_descr *);
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 3b3e0c2ac73..2153661ed5a 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -134,6 +134,36 @@ OMP_4.0 {
 	omp_is_initial_device_;
 } OMP_3.1;
 
+OMP_4.5 {
+  global:
+	omp_get_max_task_priority;
+	omp_get_max_task_priority_;
+	omp_get_num_places;
+	omp_get_num_places_;
+	omp_get_place_num_procs;
+	omp_get_place_num_procs_;
+	omp_get_place_num_procs_8_;
+	omp_get_place_proc_ids;
+	omp_get_place_proc_ids_;
+	omp_get_place_proc_ids_8_;
+	omp_get_place_num;
+	omp_get_place_num_;
+	omp_get_partition_num_places;
+	omp_get_partition_num_places_;
+	omp_get_partition_place_nums;
+	omp_get_partition_place_nums_;
+	omp_get_partition_place_nums_8_;
+	omp_get_initial_device;
+	omp_get_initial_device_;
+	omp_target_alloc;
+	omp_target_free;
+	omp_target_is_present;
+	omp_target_memcpy;
+	omp_target_memcpy_rect;
+	omp_target_associate_ptr;
+	omp_target_disassociate_ptr;
+} OMP_4.0;
+
 GOMP_1.0 {
   global:
 	GOMP_atomic_end;
@@ -234,10 +264,28 @@ GOMP_4.0.1 {
 	GOMP_offload_unregister;
 } GOMP_4.0;
 
-GOMP_4.0.2 {
+GOMP_4.5 {
   global:
+	GOMP_target_41;
+	GOMP_target_data_41;
+	GOMP_target_update_41;
+	GOMP_target_enter_exit_data;
+	GOMP_taskloop;
+	GOMP_taskloop_ull;
 	GOMP_offload_register_ver;
 	GOMP_offload_unregister_ver;
+	GOMP_loop_doacross_dynamic_start;
+	GOMP_loop_doacross_guided_start;
+	GOMP_loop_doacross_runtime_start;
+	GOMP_loop_doacross_static_start;
+	GOMP_doacross_post;
+	GOMP_doacross_wait;
+	GOMP_loop_ull_doacross_dynamic_start;
+	GOMP_loop_ull_doacross_guided_start;
+	GOMP_loop_ull_doacross_runtime_start;
+	GOMP_loop_ull_doacross_static_start;
+	GOMP_doacross_ull_post;
+	GOMP_doacross_ull_wait;
 } GOMP_4.0.1;
 
 OACC_2.0 {
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 06b1c67fc02..67e6d199066 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -155,6 +155,7 @@ linkage, and do not throw exceptions.
 * omp_get_dynamic::             Dynamic teams setting
 * omp_get_level::               Number of parallel regions
 * omp_get_max_active_levels::   Maximum number of active regions
+* omp_get_max_task_priority::   Maximum task priority value that can be set
 * omp_get_max_threads::         Maximum number of threads of parallel region
 * omp_get_nested::              Nested parallel regions
 * omp_get_num_devices::         Number of target devices
@@ -388,6 +389,27 @@ This function obtains the maximum allowed number of nested, active parallel regi
 @end table
 
 
+@node omp_get_max_task_priority
+@section @code{omp_get_max_task_priority} -- Maximum priority value that can be set for tasks
+
+@table @asis
+@item @emph{Description}:
+This function obtains the maximum allowed priority number for tasks.
+
+@item @emph{C/C++}
+@multitable @columnfractions .20 .80
+@item @emph{Prototype}: @tab @code{int omp_get_max_task_priority(void);}
+@end multitable
+
+@item @emph{Fortran}:
+@multitable @columnfractions .20 .80
+@item @emph{Interface}: @tab @code{integer function omp_get_max_task_priority()}
+@end multitable
+
+@item @emph{Reference}:
+@uref{http://www.openmp.org/, OpenMP specification v4.5}, Section 3.2.29.
+@end table
+
 
 @node omp_get_max_threads
 @section @code{omp_get_max_threads} -- Maximum number of threads of parallel region
@@ -581,18 +603,18 @@ set via @env{OMP_PROC_BIND}.  Possible values are @code{omp_proc_bind_false},
 Obtain the runtime scheduling method.  The @var{kind} argument will be
 set to the value @code{omp_sched_static}, @code{omp_sched_dynamic},
 @code{omp_sched_guided} or @code{omp_sched_auto}.  The second argument,
-@var{modifier}, is set to the chunk size.
+@var{chunk_size}, is set to the chunk size.
 
 @item @emph{C/C++}
 @multitable @columnfractions .20 .80
-@item @emph{Prototype}: @tab @code{void omp_get_schedule(omp_sched_t *kind, int *modifier);}
+@item @emph{Prototype}: @tab @code{void omp_get_schedule(omp_sched_t *kind, int *chunk_size);}
 @end multitable
 
 @item @emph{Fortran}:
 @multitable @columnfractions .20 .80
-@item @emph{Interface}: @tab @code{subroutine omp_get_schedule(kind, modifier)}
+@item @emph{Interface}: @tab @code{subroutine omp_get_schedule(kind, chunk_size)}
 @item                   @tab @code{integer(kind=omp_sched_kind) kind}
-@item                   @tab @code{integer modifier}
+@item                   @tab @code{integer chunk_size}
 @end multitable
 
 @item @emph{See also}:
@@ -929,19 +951,19 @@ Sets the runtime scheduling method.  The @var{kind} argument can have the
 value @code{omp_sched_static}, @code{omp_sched_dynamic},
 @code{omp_sched_guided} or @code{omp_sched_auto}.  Except for
 @code{omp_sched_auto}, the chunk size is set to the value of
-@var{modifier} if positive, or to the default value if zero or negative.
-For @code{omp_sched_auto} the @var{modifier} argument is ignored.
+@var{chunk_size} if positive, or to the default value if zero or negative.
+For @code{omp_sched_auto} the @var{chunk_size} argument is ignored.
 
 @item @emph{C/C++}
 @multitable @columnfractions .20 .80
-@item @emph{Prototype}: @tab @code{void omp_set_schedule(omp_sched_t kind, int modifier);}
+@item @emph{Prototype}: @tab @code{void omp_set_schedule(omp_sched_t kind, int chunk_size);}
 @end multitable
 
 @item @emph{Fortran}:
 @multitable @columnfractions .20 .80
-@item @emph{Interface}: @tab @code{subroutine omp_set_schedule(kind, modifier)}
+@item @emph{Interface}: @tab @code{subroutine omp_set_schedule(kind, chunk_size)}
 @item                   @tab @code{integer(kind=omp_sched_kind) kind}
-@item                   @tab @code{integer modifier}
+@item                   @tab @code{integer chunk_size}
 @end multitable
 
 @item @emph{See also}:
@@ -1311,6 +1333,7 @@ beginning with @env{GOMP_} are GNU extensions.
 * OMP_DEFAULT_DEVICE::      Set the device used in target regions
 * OMP_DYNAMIC::             Dynamic adjustment of threads
 * OMP_MAX_ACTIVE_LEVELS::   Set the maximum number of nested parallel regions
+* OMP_MAX_TASK_PRIORITY::   Set the maximum task priority value
 * OMP_NESTED::              Nested parallel regions
 * OMP_NUM_THREADS::         Specifies the number of threads to use
 * OMP_PROC_BIND::           Whether theads may be moved between CPUs
@@ -1420,6 +1443,26 @@ If undefined, the number of active levels is unlimited.
 
 
 
+@node OMP_MAX_TASK_PRIORITY
+@section @env{OMP_MAX_TASK_PRIORITY} -- Set the maximum priority number that can be set for a task
+
+@cindex Environment Variable
+@table @asis
+@item @emph{Description}:
+Specifies the initial value for the maximum priority value that can be
+set for a task.  The value of this variable shall be a non-negative
+integer (zero is allowed).  If undefined, the maximum task priority
+defaults to 0.
+
+@item @emph{See also}:
+@ref{omp_get_max_task_priority}
+
+@item @emph{Reference}:
+@uref{http://www.openmp.org/, OpenMP specification v4.5}, Section 4.14
+@end table
+
+
+
 @node OMP_NESTED
 @section @env{OMP_NESTED} -- Nested parallel regions
 @cindex Environment Variable
diff --git a/libgomp/libgomp_g.h b/libgomp/libgomp_g.h
index e7f4effaf48..c28ad2116dc 100644
--- a/libgomp/libgomp_g.h
+++ b/libgomp/libgomp_g.h
@@ -71,6 +71,15 @@ extern bool GOMP_loop_ordered_dynamic_next (long *, long *);
 extern bool GOMP_loop_ordered_guided_next (long *, long *);
 extern bool GOMP_loop_ordered_runtime_next (long *, long *);
 
+extern bool GOMP_loop_doacross_static_start (unsigned, long *, long, long *,
+					     long *);
+extern bool GOMP_loop_doacross_dynamic_start (unsigned, long *, long, long *,
+					      long *);
+extern bool GOMP_loop_doacross_guided_start (unsigned, long *, long, long *,
+					     long *);
+extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *,
+					      long *);
+
 extern void GOMP_parallel_loop_static_start (void (*)(void *), void *,
 					     unsigned, long, long, long, long);
 extern void GOMP_parallel_loop_dynamic_start (void (*)(void *), void *,
@@ -164,10 +173,34 @@ extern bool GOMP_loop_ull_ordered_guided_next (unsigned long long *,
 extern bool GOMP_loop_ull_ordered_runtime_next (unsigned long long *,
 						unsigned long long *);
 
+extern bool GOMP_loop_ull_doacross_static_start (unsigned,
+						 unsigned long long *,
+						 unsigned long long,
+						 unsigned long long *,
+						 unsigned long long *);
+extern bool GOMP_loop_ull_doacross_dynamic_start (unsigned,
+						  unsigned long long *,
+						  unsigned long long,
+						  unsigned long long *,
+						  unsigned long long *);
+extern bool GOMP_loop_ull_doacross_guided_start (unsigned,
+						 unsigned long long *,
+						 unsigned long long,
+						 unsigned long long *,
+						 unsigned long long *);
+extern bool GOMP_loop_ull_doacross_runtime_start (unsigned,
+						  unsigned long long *,
+						  unsigned long long *,
+						  unsigned long long *);
+
 /* ordered.c */
 
 extern void GOMP_ordered_start (void);
 extern void GOMP_ordered_end (void);
+extern void GOMP_doacross_post (long *);
+extern void GOMP_doacross_wait (long, ...);
+extern void GOMP_doacross_ull_post (unsigned long long *);
+extern void GOMP_doacross_ull_wait (unsigned long long, ...);
 
 /* parallel.c */
 
@@ -180,7 +213,15 @@ extern bool GOMP_cancellation_point (int);
 /* task.c */
 
 extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
-		       long, long, bool, unsigned, void **);
+		       long, long, bool, unsigned, void **, int);
+extern void GOMP_taskloop (void (*) (void *), void *,
+			   void (*) (void *, void *), long, long, unsigned,
+			   unsigned long, int, long, long, long);
+extern void GOMP_taskloop_ull (void (*) (void *), void *,
+			       void (*) (void *, void *), long, long,
+			       unsigned, unsigned long, int,
+			       unsigned long long, unsigned long long,
+			       unsigned long long);
 extern void GOMP_taskwait (void);
 extern void GOMP_taskyield (void);
 extern void GOMP_taskgroup_start (void);
@@ -208,11 +249,20 @@ extern void GOMP_single_copy_end (void *);
 
 extern void GOMP_target (int, void (*) (void *), const void *,
 			 size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_41 (int, void (*) (void *), size_t, void **, size_t *,
+			  unsigned short *, unsigned int, void **);
 extern void GOMP_target_data (int, const void *,
 			      size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_data_41 (int, size_t, void **, size_t *,
+			       unsigned short *);
 extern void GOMP_target_end_data (void);
 extern void GOMP_target_update (int, const void *,
 				size_t, void **, size_t *, unsigned char *);
+extern void GOMP_target_update_41 (int, size_t, void **, size_t *,
+				   unsigned short *, unsigned int, void **);
+extern void GOMP_target_enter_exit_data (int, size_t, void **, size_t *,
+					 unsigned short *, unsigned int,
+					 void **);
 extern void GOMP_teams (unsigned int, unsigned int);
 
 /* oacc-parallel.c */
diff --git a/libgomp/loop.c b/libgomp/loop.c
index 27d78db7a56..812f66cd725 100644
--- a/libgomp/loop.c
+++ b/libgomp/loop.c
@@ -169,13 +169,16 @@ GOMP_loop_runtime_start (long start, long end, long incr,
   switch (icv->run_sched_var)
     {
     case GFS_STATIC:
-      return gomp_loop_static_start (start, end, incr, icv->run_sched_modifier,
+      return gomp_loop_static_start (start, end, incr,
+				     icv->run_sched_chunk_size,
 				     istart, iend);
     case GFS_DYNAMIC:
-      return gomp_loop_dynamic_start (start, end, incr, icv->run_sched_modifier,
+      return gomp_loop_dynamic_start (start, end, incr,
+				      icv->run_sched_chunk_size,
 				      istart, iend);
     case GFS_GUIDED:
-      return gomp_loop_guided_start (start, end, incr, icv->run_sched_modifier,
+      return gomp_loop_guided_start (start, end, incr,
+				     icv->run_sched_chunk_size,
 				     istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -266,15 +269,15 @@ GOMP_loop_ordered_runtime_start (long start, long end, long incr,
     {
     case GFS_STATIC:
       return gomp_loop_ordered_static_start (start, end, incr,
-					     icv->run_sched_modifier,
+					     icv->run_sched_chunk_size,
 					     istart, iend);
     case GFS_DYNAMIC:
       return gomp_loop_ordered_dynamic_start (start, end, incr,
-					      icv->run_sched_modifier,
+					      icv->run_sched_chunk_size,
 					      istart, iend);
     case GFS_GUIDED:
       return gomp_loop_ordered_guided_start (start, end, incr,
-					     icv->run_sched_modifier,
+					     icv->run_sched_chunk_size,
 					     istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -286,6 +289,111 @@ GOMP_loop_ordered_runtime_start (long start, long end, long incr,
     }
 }
 
+/* The *_doacross_*_start routines are similar.  The only difference is that
+   this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
+   section, and the worksharing loop always iterates from 0 to COUNTS[0] - 1
+   while the other COUNTS array elements tell the library the number of
+   iterations in the ordered inner loops.  */
+
+static bool
+gomp_loop_doacross_static_start (unsigned ncounts, long *counts,
+				 long chunk_size, long *istart, long *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+		      GFS_STATIC, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+  return !gomp_iter_static_next (istart, iend);
+}
+
+static bool
+gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
+				  long chunk_size, long *istart, long *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+		      GFS_DYNAMIC, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#ifdef HAVE_SYNC_BUILTINS
+  ret = gomp_iter_dynamic_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_dynamic_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+static bool
+gomp_loop_doacross_guided_start (unsigned ncounts, long *counts,
+				 long chunk_size, long *istart, long *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+		      GFS_GUIDED, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#ifdef HAVE_SYNC_BUILTINS
+  ret = gomp_iter_guided_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_guided_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+bool
+GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts,
+				  long *istart, long *iend)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  switch (icv->run_sched_var)
+    {
+    case GFS_STATIC:
+      return gomp_loop_doacross_static_start (ncounts, counts,
+					      icv->run_sched_chunk_size,
+					      istart, iend);
+    case GFS_DYNAMIC:
+      return gomp_loop_doacross_dynamic_start (ncounts, counts,
+					       icv->run_sched_chunk_size,
+					       istart, iend);
+    case GFS_GUIDED:
+      return gomp_loop_doacross_guided_start (ncounts, counts,
+					      icv->run_sched_chunk_size,
+					      istart, iend);
+    case GFS_AUTO:
+      /* For now map to schedule(static), later on we could play with feedback
+	 driven choice.  */
+      return gomp_loop_doacross_static_start (ncounts, counts,
+					      0, istart, iend);
+    default:
+      abort ();
+    }
+}
+
 /* The *_next routines are called when the thread completes processing of 
    the iteration block currently assigned to it.  If the work-share 
    construct is bound directly to a parallel construct, then the iteration
@@ -484,7 +592,7 @@ GOMP_parallel_loop_runtime_start (void (*fn) (void *), void *data,
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    icv->run_sched_var, icv->run_sched_modifier, 0);
+			    icv->run_sched_var, icv->run_sched_chunk_size, 0);
 }
 
 ialias_redirect (GOMP_parallel_end)
@@ -529,7 +637,7 @@ GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    icv->run_sched_var, icv->run_sched_modifier,
+			    icv->run_sched_var, icv->run_sched_chunk_size,
 			    flags);
   fn (data);
   GOMP_parallel_end ();
@@ -578,6 +686,13 @@ extern __typeof(gomp_loop_ordered_dynamic_start) GOMP_loop_ordered_dynamic_start
 extern __typeof(gomp_loop_ordered_guided_start) GOMP_loop_ordered_guided_start
 	__attribute__((alias ("gomp_loop_ordered_guided_start")));
 
+extern __typeof(gomp_loop_doacross_static_start) GOMP_loop_doacross_static_start
+	__attribute__((alias ("gomp_loop_doacross_static_start")));
+extern __typeof(gomp_loop_doacross_dynamic_start) GOMP_loop_doacross_dynamic_start
+	__attribute__((alias ("gomp_loop_doacross_dynamic_start")));
+extern __typeof(gomp_loop_doacross_guided_start) GOMP_loop_doacross_guided_start
+	__attribute__((alias ("gomp_loop_doacross_guided_start")));
+
 extern __typeof(gomp_loop_static_next) GOMP_loop_static_next
 	__attribute__((alias ("gomp_loop_static_next")));
 extern __typeof(gomp_loop_dynamic_next) GOMP_loop_dynamic_next
@@ -638,6 +753,30 @@ GOMP_loop_ordered_guided_start (long start, long end, long incr,
 }
 
 bool
+GOMP_loop_doacross_static_start (unsigned ncounts, long *counts,
+				 long chunk_size, long *istart, long *iend)
+{
+  return gomp_loop_doacross_static_start (ncounts, counts, chunk_size,
+					  istart, iend);
+}
+
+bool
+GOMP_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
+				  long chunk_size, long *istart, long *iend)
+{
+  return gomp_loop_doacross_dynamic_start (ncounts, counts, chunk_size,
+					   istart, iend);
+}
+
+bool
+GOMP_loop_doacross_guided_start (unsigned ncounts, long *counts,
+				 long chunk_size, long *istart, long *iend)
+{
+  return gomp_loop_doacross_guided_start (ncounts, counts, chunk_size,
+					  istart, iend);
+}
+
+bool
 GOMP_loop_static_next (long *istart, long *iend)
 {
   return gomp_loop_static_next (istart, iend);
diff --git a/libgomp/loop_ull.c b/libgomp/loop_ull.c
index de56ae0b7ce..1f2ed546024 100644
--- a/libgomp/loop_ull.c
+++ b/libgomp/loop_ull.c
@@ -175,15 +175,15 @@ GOMP_loop_ull_runtime_start (bool up, gomp_ull start, gomp_ull end,
     {
     case GFS_STATIC:
       return gomp_loop_ull_static_start (up, start, end, incr,
-					 icv->run_sched_modifier,
+					 icv->run_sched_chunk_size,
 					 istart, iend);
     case GFS_DYNAMIC:
       return gomp_loop_ull_dynamic_start (up, start, end, incr,
-					  icv->run_sched_modifier,
+					  icv->run_sched_chunk_size,
 					  istart, iend);
     case GFS_GUIDED:
       return gomp_loop_ull_guided_start (up, start, end, incr,
-					 icv->run_sched_modifier,
+					 icv->run_sched_chunk_size,
 					 istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -279,15 +279,15 @@ GOMP_loop_ull_ordered_runtime_start (bool up, gomp_ull start, gomp_ull end,
     {
     case GFS_STATIC:
       return gomp_loop_ull_ordered_static_start (up, start, end, incr,
-						 icv->run_sched_modifier,
+						 icv->run_sched_chunk_size,
 						 istart, iend);
     case GFS_DYNAMIC:
       return gomp_loop_ull_ordered_dynamic_start (up, start, end, incr,
-						  icv->run_sched_modifier,
+						  icv->run_sched_chunk_size,
 						  istart, iend);
     case GFS_GUIDED:
       return gomp_loop_ull_ordered_guided_start (up, start, end, incr,
-						 icv->run_sched_modifier,
+						 icv->run_sched_chunk_size,
 						 istart, iend);
     case GFS_AUTO:
       /* For now map to schedule(static), later on we could play with feedback
@@ -299,6 +299,114 @@ GOMP_loop_ull_ordered_runtime_start (bool up, gomp_ull start, gomp_ull end,
     }
 }
 
+/* The *_doacross_*_start routines are similar.  The only difference is that
+   this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
+   section, and the worksharing loop always iterates from 0 to COUNTS[0] - 1
+   and other COUNTS array elements tell the library the number of iterations
+   in the ordered inner loops.  */
+
+static bool
+gomp_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  GFS_STATIC, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+  return !gomp_iter_ull_static_next (istart, iend);
+}
+
+static bool
+gomp_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+				      gomp_ull chunk_size, gomp_ull *istart,
+				      gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  GFS_DYNAMIC, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+  ret = gomp_iter_ull_dynamic_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+static bool
+gomp_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  bool ret;
+
+  if (gomp_work_share_start (false))
+    {
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  GFS_GUIDED, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_work_share_init_done ();
+    }
+
+#if defined HAVE_SYNC_BUILTINS && defined __LP64__
+  ret = gomp_iter_ull_guided_next (istart, iend);
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  ret = gomp_iter_ull_guided_next_locked (istart, iend);
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
+bool
+GOMP_loop_ull_doacross_runtime_start (unsigned ncounts, gomp_ull *counts,
+				      gomp_ull *istart, gomp_ull *iend)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  switch (icv->run_sched_var)
+    {
+    case GFS_STATIC:
+      return gomp_loop_ull_doacross_static_start (ncounts, counts,
+						  icv->run_sched_chunk_size,
+						  istart, iend);
+    case GFS_DYNAMIC:
+      return gomp_loop_ull_doacross_dynamic_start (ncounts, counts,
+						   icv->run_sched_chunk_size,
+						   istart, iend);
+    case GFS_GUIDED:
+      return gomp_loop_ull_doacross_guided_start (ncounts, counts,
+						  icv->run_sched_chunk_size,
+						  istart, iend);
+    case GFS_AUTO:
+      /* For now map to schedule(static), later on we could play with feedback
+	 driven choice.  */
+      return gomp_loop_ull_doacross_static_start (ncounts, counts,
+						  0, istart, iend);
+    default:
+      abort ();
+    }
+}
+
 /* The *_next routines are called when the thread completes processing of
    the iteration block currently assigned to it.  If the work-share
    construct is bound directly to a parallel construct, then the iteration
@@ -466,6 +574,13 @@ extern __typeof(gomp_loop_ull_ordered_dynamic_start) GOMP_loop_ull_ordered_dynam
 extern __typeof(gomp_loop_ull_ordered_guided_start) GOMP_loop_ull_ordered_guided_start
 	__attribute__((alias ("gomp_loop_ull_ordered_guided_start")));
 
+extern __typeof(gomp_loop_ull_doacross_static_start) GOMP_loop_ull_doacross_static_start
+	__attribute__((alias ("gomp_loop_ull_doacross_static_start")));
+extern __typeof(gomp_loop_ull_doacross_dynamic_start) GOMP_loop_ull_doacross_dynamic_start
+	__attribute__((alias ("gomp_loop_ull_doacross_dynamic_start")));
+extern __typeof(gomp_loop_ull_doacross_guided_start) GOMP_loop_ull_doacross_guided_start
+	__attribute__((alias ("gomp_loop_ull_doacross_guided_start")));
+
 extern __typeof(gomp_loop_ull_static_next) GOMP_loop_ull_static_next
 	__attribute__((alias ("gomp_loop_ull_static_next")));
 extern __typeof(gomp_loop_ull_dynamic_next) GOMP_loop_ull_dynamic_next
@@ -535,6 +650,33 @@ GOMP_loop_ull_ordered_guided_start (bool up, gomp_ull start, gomp_ull end,
 }
 
 bool
+GOMP_loop_ull_doacross_static_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_static_start (ncounts, counts, chunk_size,
+					      istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_dynamic_start (unsigned ncounts, gomp_ull *counts,
+				      gomp_ull chunk_size, gomp_ull *istart,
+				      gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_dynamic_start (ncounts, counts, chunk_size,
+					       istart, iend);
+}
+
+bool
+GOMP_loop_ull_doacross_guided_start (unsigned ncounts, gomp_ull *counts,
+				     gomp_ull chunk_size, gomp_ull *istart,
+				     gomp_ull *iend)
+{
+  return gomp_loop_ull_doacross_guided_start (ncounts, counts, chunk_size,
+					      istart, iend);
+}
+
+bool
 GOMP_loop_ull_static_next (gomp_ull *istart, gomp_ull *iend)
 {
   return gomp_loop_ull_static_next (istart, iend);
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 90d43eb2b8a..af067d6e73c 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -289,7 +289,8 @@ acc_map_data (void *h, void *d, size_t s)
       if (d != h)
         gomp_fatal ("cannot map data on shared-memory system");
 
-      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, false);
+      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
+			   GOMP_MAP_VARS_OPENACC);
     }
   else
     {
@@ -318,7 +319,7 @@ acc_map_data (void *h, void *d, size_t s)
       gomp_mutex_unlock (&acc_dev->lock);
 
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
-			   &kinds, true, false);
+			   &kinds, true, GOMP_MAP_VARS_OPENACC);
     }
 
   gomp_mutex_lock (&acc_dev->lock);
@@ -447,7 +448,7 @@ present_create_copy (unsigned f, void *h, size_t s)
       gomp_mutex_unlock (&acc_dev->lock);
 
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
-			   false);
+			   GOMP_MAP_VARS_OPENACC);
 
       gomp_mutex_lock (&acc_dev->lock);
 
@@ -594,7 +595,7 @@ gomp_acc_insert_pointer (size_t mapnum, void **hostaddrs, size_t *sizes,
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
-		       NULL, sizes, kinds, true, false);
+		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
 
   gomp_mutex_lock (&acc_dev->lock);
@@ -651,7 +652,7 @@ gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
     }
 
   if (force_copyfrom)
-    t->list[0]->copy_from = 1;
+    t->list[0].copy_from = 1;
 
   gomp_mutex_unlock (&acc_dev->lock);
 
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index e31bc0a7bc6..b150106981e 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -168,12 +168,12 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
     tgt_fn = (void (*)) fn;
 
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       false);
+		       GOMP_MAP_VARS_OPENACC);
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
-    devaddrs[i] = (void *) (tgt->list[i]->tgt->tgt_start
-			    + tgt->list[i]->tgt_offset);
+    devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
+			    + tgt->list[i].key->tgt_offset);
 
   acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, sizes,
 			      kinds, async, dims, tgt);
@@ -228,7 +228,8 @@ GOACC_data_start (int device, size_t mapnum,
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
       || host_fallback)
     {
-      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, false);
+      tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
+			   GOMP_MAP_VARS_OPENACC);
       tgt->prev = thr->mapped_data;
       thr->mapped_data = tgt;
 
@@ -237,7 +238,7 @@ GOACC_data_start (int device, size_t mapnum,
 
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
-		       false);
+		       GOMP_MAP_VARS_OPENACC);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);
   tgt->prev = thr->mapped_data;
   thr->mapped_data = tgt;
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index dac3e8ad6ef..090498ad784 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -62,6 +62,15 @@ typedef enum omp_proc_bind_t
   omp_proc_bind_spread = 4
 } omp_proc_bind_t;
 
+typedef enum omp_lock_hint_t
+{
+  omp_lock_hint_none = 0,
+  omp_lock_hint_uncontended = 1,
+  omp_lock_hint_contended = 2,
+  omp_lock_hint_nonspeculative = 4,
+  omp_lock_hint_speculative = 8
+} omp_lock_hint_t;
+
 #ifdef __cplusplus
 extern "C" {
 # define __GOMP_NOTHROW throw ()
@@ -84,12 +93,16 @@ extern void omp_set_nested (int) __GOMP_NOTHROW;
 extern int omp_get_nested (void) __GOMP_NOTHROW;
 
 extern void omp_init_lock (omp_lock_t *) __GOMP_NOTHROW;
+extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t)
+  __GOMP_NOTHROW;
 extern void omp_destroy_lock (omp_lock_t *) __GOMP_NOTHROW;
 extern void omp_set_lock (omp_lock_t *) __GOMP_NOTHROW;
 extern void omp_unset_lock (omp_lock_t *) __GOMP_NOTHROW;
 extern int omp_test_lock (omp_lock_t *) __GOMP_NOTHROW;
 
 extern void omp_init_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW;
+extern void omp_init_nest_lock_with_hint (omp_nest_lock_t *, omp_lock_hint_t)
+  __GOMP_NOTHROW;
 extern void omp_destroy_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW;
 extern void omp_set_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW;
 extern void omp_unset_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW;
@@ -112,6 +125,12 @@ extern int omp_in_final (void) __GOMP_NOTHROW;
 
 extern int omp_get_cancellation (void) __GOMP_NOTHROW;
 extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW;
+extern int omp_get_num_places (void) __GOMP_NOTHROW;
+extern int omp_get_place_num_procs (int) __GOMP_NOTHROW;
+extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW;
+extern int omp_get_place_num (void) __GOMP_NOTHROW;
+extern int omp_get_partition_num_places (void) __GOMP_NOTHROW;
+extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW;
 
 extern void omp_set_default_device (int) __GOMP_NOTHROW;
 extern int omp_get_default_device (void) __GOMP_NOTHROW;
@@ -120,6 +139,24 @@ extern int omp_get_num_teams (void) __GOMP_NOTHROW;
 extern int omp_get_team_num (void) __GOMP_NOTHROW;
 
 extern int omp_is_initial_device (void) __GOMP_NOTHROW;
+extern int omp_get_initial_device (void) __GOMP_NOTHROW;
+extern int omp_get_max_task_priority (void) __GOMP_NOTHROW;
+
+extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern void omp_target_free (void *, int) __GOMP_NOTHROW;
+extern int omp_target_is_present (void *, int) __GOMP_NOTHROW;
+extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__,
+			      __SIZE_TYPE__, int, int) __GOMP_NOTHROW;
+extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int,
+				   const __SIZE_TYPE__ *,
+				   const __SIZE_TYPE__ *,
+				   const __SIZE_TYPE__ *,
+				   const __SIZE_TYPE__ *,
+				   const __SIZE_TYPE__ *, int, int)
+  __GOMP_NOTHROW;
+extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__,
+				     __SIZE_TYPE__, int) __GOMP_NOTHROW;
+extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW;
 
 #ifdef __cplusplus
 }
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index 122563e625a..28df9c1664e 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -29,15 +29,31 @@
         integer, parameter :: omp_nest_lock_kind = @OMP_NEST_LOCK_KIND@
         integer, parameter :: omp_sched_kind = 4
         integer, parameter :: omp_proc_bind_kind = 4
+        integer, parameter :: omp_lock_hint_kind = 4
         integer (omp_sched_kind), parameter :: omp_sched_static = 1
         integer (omp_sched_kind), parameter :: omp_sched_dynamic = 2
         integer (omp_sched_kind), parameter :: omp_sched_guided = 3
         integer (omp_sched_kind), parameter :: omp_sched_auto = 4
-        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_false = 0
-        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_true = 1
-        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_master = 2
-        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_close = 3
-        integer (omp_proc_bind_kind), parameter :: omp_proc_bind_spread = 4
+        integer (omp_proc_bind_kind), &
+                 parameter :: omp_proc_bind_false = 0
+        integer (omp_proc_bind_kind), &
+                 parameter :: omp_proc_bind_true = 1
+        integer (omp_proc_bind_kind), &
+                 parameter :: omp_proc_bind_master = 2
+        integer (omp_proc_bind_kind), &
+                 parameter :: omp_proc_bind_close = 3
+        integer (omp_proc_bind_kind), &
+                 parameter :: omp_proc_bind_spread = 4
+        integer (omp_lock_hint_kind), &
+                 parameter :: omp_lock_hint_none = 0
+        integer (omp_lock_hint_kind), &
+                 parameter :: omp_lock_hint_uncontended = 1
+        integer (omp_lock_hint_kind), &
+                 parameter :: omp_lock_hint_contended = 2
+        integer (omp_lock_hint_kind), &
+                 parameter :: omp_lock_hint_nonspeculative = 4
+        integer (omp_lock_hint_kind), &
+                 parameter :: omp_lock_hint_speculative = 8
       end module
 
       module omp_lib
@@ -53,6 +69,14 @@
         end interface
 
         interface
+          subroutine omp_init_lock_with_hint (svar, hint)
+            use omp_lib_kinds
+            integer (omp_lock_kind), intent (out) :: svar
+            integer (omp_lock_hint_kind), intent (in) :: hint
+          end subroutine omp_init_lock_with_hint
+        end interface
+
+        interface
           subroutine omp_init_nest_lock (nvar)
             use omp_lib_kinds
             integer (omp_nest_lock_kind), intent (out) :: nvar
@@ -60,6 +84,14 @@
         end interface
 
         interface
+          subroutine omp_init_nest_lock_with_hint (nvar, hint)
+            use omp_lib_kinds
+            integer (omp_nest_lock_kind), intent (out) :: nvar
+            integer (omp_lock_hint_kind), intent (in) :: hint
+          end subroutine omp_init_nest_lock_with_hint
+        end interface
+
+        interface
           subroutine omp_destroy_lock (svar)
             use omp_lib_kinds
             integer (omp_lock_kind), intent (inout) :: svar
@@ -199,28 +231,28 @@
         end interface
 
         interface omp_set_schedule
-          subroutine omp_set_schedule (kind, modifier)
+          subroutine omp_set_schedule (kind, chunk_size)
             use omp_lib_kinds
             integer (omp_sched_kind), intent (in) :: kind
-            integer (4), intent (in) :: modifier
+            integer (4), intent (in) :: chunk_size
           end subroutine omp_set_schedule
-          subroutine omp_set_schedule_8 (kind, modifier)
+          subroutine omp_set_schedule_8 (kind, chunk_size)
             use omp_lib_kinds
             integer (omp_sched_kind), intent (in) :: kind
-            integer (8), intent (in) :: modifier
+            integer (8), intent (in) :: chunk_size
           end subroutine omp_set_schedule_8
          end interface
 
         interface omp_get_schedule
-          subroutine omp_get_schedule (kind, modifier)
+          subroutine omp_get_schedule (kind, chunk_size)
             use omp_lib_kinds
             integer (omp_sched_kind), intent (out) :: kind
-            integer (4), intent (out) :: modifier
+            integer (4), intent (out) :: chunk_size
           end subroutine omp_get_schedule
-          subroutine omp_get_schedule_8 (kind, modifier)
+          subroutine omp_get_schedule_8 (kind, chunk_size)
             use omp_lib_kinds
             integer (omp_sched_kind), intent (out) :: kind
-            integer (8), intent (out) :: modifier
+            integer (8), intent (out) :: chunk_size
           end subroutine omp_get_schedule_8
          end interface
 
@@ -298,6 +330,58 @@
           end function omp_get_proc_bind
         end interface
 
+        interface
+          function omp_get_num_places ()
+            integer (4) :: omp_get_num_places
+          end function omp_get_num_places
+        end interface
+
+        interface omp_get_place_num_procs
+          function omp_get_place_num_procs (place_num)
+            integer (4), intent(in) :: place_num
+            integer (4) :: omp_get_place_num_procs
+          end function omp_get_place_num_procs
+
+          function omp_get_place_num_procs_8 (place_num)
+            integer (8), intent(in) :: place_num
+            integer (4) :: omp_get_place_num_procs_8
+          end function omp_get_place_num_procs_8
+        end interface
+
+        interface omp_get_place_proc_ids
+          subroutine omp_get_place_proc_ids (place_num, ids)
+            integer (4), intent(in) :: place_num
+            integer (4), intent(out) :: ids(*)
+          end subroutine omp_get_place_proc_ids
+
+          subroutine omp_get_place_proc_ids_8 (place_num, ids)
+            integer (8), intent(in) :: place_num
+            integer (8), intent(out) :: ids(*)
+          end subroutine omp_get_place_proc_ids_8
+        end interface
+
+        interface
+          function omp_get_place_num ()
+            integer (4) :: omp_get_place_num
+          end function omp_get_place_num
+        end interface
+
+        interface
+          function omp_get_partition_num_places ()
+            integer (4) :: omp_get_partition_num_places
+          end function omp_get_partition_num_places
+        end interface
+
+        interface omp_get_partition_place_nums
+          subroutine omp_get_partition_place_nums (place_nums)
+            integer (4), intent(out) :: place_nums(*)
+          end subroutine omp_get_partition_place_nums
+
+          subroutine omp_get_partition_place_nums_8 (place_nums)
+            integer (8), intent(out) :: place_nums(*)
+          end subroutine omp_get_partition_place_nums_8
+        end interface
+
         interface omp_set_default_device
           subroutine omp_set_default_device (device_num)
             integer (4), intent (in) :: device_num
@@ -337,4 +421,16 @@
           end function omp_is_initial_device
         end interface
 
+        interface
+          function omp_get_initial_device ()
+            integer (4) :: omp_get_initial_device
+          end function omp_get_initial_device
+        end interface
+
+        interface
+          function omp_get_max_task_priority ()
+            integer (4) :: omp_get_max_task_priority
+          end function omp_get_max_task_priority
+        end interface
+
       end module omp_lib
diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in
index d590bc15135..81662424500 100644
--- a/libgomp/omp_lib.h.in
+++ b/libgomp/omp_lib.h.in
@@ -46,9 +46,23 @@
       parameter (omp_proc_bind_master = 2)
       parameter (omp_proc_bind_close = 3)
       parameter (omp_proc_bind_spread = 4)
+      integer omp_lock_hint_kind
+      parameter (omp_lock_hint_kind = 4)
+      integer (omp_lock_hint_kind) omp_lock_hint_none
+      integer (omp_lock_hint_kind) omp_lock_hint_uncontended
+      integer (omp_lock_hint_kind) omp_lock_hint_contended
+      integer (omp_lock_hint_kind) omp_lock_hint_nonspeculative
+      integer (omp_lock_hint_kind) omp_lock_hint_speculative
+      parameter (omp_lock_hint_none = 0)
+      parameter (omp_lock_hint_uncontended = 1)
+      parameter (omp_lock_hint_contended = 2)
+      parameter (omp_lock_hint_nonspeculative = 4)
+      parameter (omp_lock_hint_speculative = 8)
       parameter (openmp_version = 201307)
 
       external omp_init_lock, omp_init_nest_lock
+      external omp_init_lock_with_hint
+      external omp_init_nest_lock_with_hint
       external omp_destroy_lock, omp_destroy_nest_lock
       external omp_set_lock, omp_set_nest_lock
       external omp_unset_lock, omp_unset_nest_lock
@@ -88,6 +102,17 @@
       external omp_get_proc_bind
       integer(omp_proc_bind_kind) omp_get_proc_bind
 
+      integer(4) omp_get_num_places
+      external omp_get_num_places
+      integer(4) omp_get_place_num_procs
+      external omp_get_place_num_procs
+      external omp_get_place_proc_ids
+      integer(4) omp_get_place_num
+      external omp_get_place_num
+      integer(4) omp_get_partition_num_places
+      external omp_get_partition_num_places
+      external omp_get_partition_place_nums
+
       external omp_set_default_device, omp_get_default_device
       external omp_get_num_devices, omp_get_num_teams
       external omp_get_team_num
@@ -96,3 +121,8 @@
 
       external omp_is_initial_device
       logical(4) omp_is_initial_device
+      external omp_get_initial_device
+      integer(4) omp_get_initial_device
+
+      external omp_get_max_task_priority
+      integer(4) omp_get_max_task_priority
diff --git a/libgomp/ordered.c b/libgomp/ordered.c
index 69ca217b4d5..fdac3ee8f58 100644
--- a/libgomp/ordered.c
+++ b/libgomp/ordered.c
@@ -26,6 +26,9 @@
 /* This file handles the ORDERED construct.  */
 
 #include "libgomp.h"
+#include <stdarg.h>
+#include <string.h>
+#include "doacross.h"
 
 
 /* This function is called when first allocating an iteration block.  That
@@ -250,3 +253,521 @@ void
 GOMP_ordered_end (void)
 {
 }
+
+/* DOACROSS initialization.  */
+
+#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
+
+void
+gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_work_share *ws = thr->ts.work_share;
+  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+  unsigned long ent, num_ents, elt_sz, shift_sz;
+  struct gomp_doacross_work_share *doacross;
+
+  if (team == NULL || team->nthreads == 1)
+    return;
+
+  for (i = 0; i < ncounts; i++)
+    {
+      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
+      if (counts[i] == 0)
+	return;
+
+      if (num_bits <= MAX_COLLAPSED_BITS)
+	{
+	  unsigned int this_bits;
+	  if (counts[i] == 1)
+	    this_bits = 1;
+	  else
+	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
+			- __builtin_clzl (counts[i] - 1);
+	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+	    {
+	      bits[i] = this_bits;
+	      num_bits += this_bits;
+	    }
+	  else
+	    num_bits = MAX_COLLAPSED_BITS + 1;
+	}
+    }
+
+  if (ws->sched == GFS_STATIC)
+    num_ents = team->nthreads;
+  else
+    num_ents = (counts[0] - 1) / chunk_size + 1;
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      elt_sz = sizeof (unsigned long);
+      shift_sz = ncounts * sizeof (unsigned int);
+    }
+  else
+    {
+      elt_sz = sizeof (unsigned long) * ncounts;
+      shift_sz = 0;
+    }
+  elt_sz = (elt_sz + 63) & ~63UL;
+
+  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+			  + shift_sz);
+  doacross->chunk_size = chunk_size;
+  doacross->elt_sz = elt_sz;
+  doacross->ncounts = ncounts;
+  doacross->flattened = false;
+  doacross->array = (unsigned char *)
+		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+		     & ~(uintptr_t) 63);
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      unsigned int shift_count = 0;
+      doacross->flattened = true;
+      for (i = ncounts; i > 0; i--)
+	{
+	  doacross->shift_counts[i - 1] = shift_count;
+	  shift_count += bits[i - 1];
+	}
+      for (ent = 0; ent < num_ents; ent++)
+	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+    }
+  else
+    for (ent = 0; ent < num_ents; ent++)
+      memset (doacross->array + ent * elt_sz, '\0',
+	      sizeof (unsigned long) * ncounts);
+  if (ws->sched == GFS_STATIC && chunk_size == 0)
+    {
+      unsigned long q = counts[0] / num_ents;
+      unsigned long t = counts[0] % num_ents;
+      doacross->boundary = t * (q + 1);
+      doacross->q = q;
+      doacross->t = t;
+    }
+  ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation.  */
+
+void
+GOMP_doacross_post (long *counts)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    ent = thr->ts.team_id;
+  else
+    ent = counts[0] / doacross->chunk_size;
+  unsigned long *array = (unsigned long *) (doacross->array
+					    + ent * doacross->elt_sz);
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long flattened
+	= (unsigned long) counts[0] << doacross->shift_counts[0];
+
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= (unsigned long) counts[i]
+		     << doacross->shift_counts[i];
+      flattened++;
+      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+	__atomic_thread_fence (MEMMODEL_RELEASE);
+      else
+	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+      return;
+    }
+
+  __atomic_thread_fence (MEMMODEL_ACQUIRE);
+  for (i = doacross->ncounts; i-- > 0; )
+    {
+      if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
+	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+    }
+}
+
+/* DOACROSS WAIT operation.  */
+
+void
+GOMP_doacross_wait (long first, ...)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  va_list ap;
+  unsigned long ent;
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    {
+      if (ws->chunk_size == 0)
+	{
+	  if (first < doacross->boundary)
+	    ent = first / (doacross->q + 1);
+	  else
+	    ent = (first - doacross->boundary) / doacross->q
+		  + doacross->t;
+	}
+      else
+	ent = first / ws->chunk_size % thr->ts.team->nthreads;
+    }
+  else
+    ent = first / doacross->chunk_size;
+  unsigned long *array = (unsigned long *) (doacross->array
+					    + ent * doacross->elt_sz);
+
+  if (__builtin_expect (doacross->flattened, 1))
+    {
+      unsigned long flattened
+	= (unsigned long) first << doacross->shift_counts[0];
+      unsigned long cur;
+
+      va_start (ap, first);
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= (unsigned long) va_arg (ap, long)
+		     << doacross->shift_counts[i];
+      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+      if (flattened < cur)
+	{
+	  __atomic_thread_fence (MEMMODEL_RELEASE);
+	  va_end (ap);
+	  return;
+	}
+      doacross_spin (array, flattened, cur);
+      __atomic_thread_fence (MEMMODEL_RELEASE);
+      va_end (ap);
+      return;
+    }
+
+  do
+    {
+      va_start (ap, first);
+      for (i = 0; i < doacross->ncounts; i++)
+	{
+	  unsigned long thisv
+	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
+	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+	  if (thisv < cur)
+	    {
+	      i = doacross->ncounts;
+	      break;
+	    }
+	  if (thisv > cur)
+	    break;
+	}
+      va_end (ap);
+      if (i == doacross->ncounts)
+	break;
+      cpu_relax ();
+    }
+  while (1);
+  __sync_synchronize ();
+}
+
+typedef unsigned long long gomp_ull;
+
+void
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_work_share *ws = thr->ts.work_share;
+  unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
+  unsigned long ent, num_ents, elt_sz, shift_sz;
+  struct gomp_doacross_work_share *doacross;
+
+  if (team == NULL || team->nthreads == 1)
+    return;
+
+  for (i = 0; i < ncounts; i++)
+    {
+      /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
+      if (counts[i] == 0)
+	return;
+
+      if (num_bits <= MAX_COLLAPSED_BITS)
+	{
+	  unsigned int this_bits;
+	  if (counts[i] == 1)
+	    this_bits = 1;
+	  else
+	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
+			- __builtin_clzll (counts[i] - 1);
+	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
+	    {
+	      bits[i] = this_bits;
+	      num_bits += this_bits;
+	    }
+	  else
+	    num_bits = MAX_COLLAPSED_BITS + 1;
+	}
+    }
+
+  if (ws->sched == GFS_STATIC)
+    num_ents = team->nthreads;
+  else
+    num_ents = (counts[0] - 1) / chunk_size + 1;
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      elt_sz = sizeof (unsigned long);
+      shift_sz = ncounts * sizeof (unsigned int);
+    }
+  else
+    {
+      if (sizeof (gomp_ull) == sizeof (unsigned long))
+	elt_sz = sizeof (gomp_ull) * ncounts;
+      else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
+	elt_sz = sizeof (unsigned long) * 2 * ncounts;
+      else
+	abort ();
+      shift_sz = 0;
+    }
+  elt_sz = (elt_sz + 63) & ~63UL;
+
+  doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
+			  + shift_sz);
+  doacross->chunk_size_ull = chunk_size;
+  doacross->elt_sz = elt_sz;
+  doacross->ncounts = ncounts;
+  doacross->flattened = false;
+  doacross->boundary = 0;
+  doacross->array = (unsigned char *)
+		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+		     & ~(uintptr_t) 63);
+  if (num_bits <= MAX_COLLAPSED_BITS)
+    {
+      unsigned int shift_count = 0;
+      doacross->flattened = true;
+      for (i = ncounts; i > 0; i--)
+	{
+	  doacross->shift_counts[i - 1] = shift_count;
+	  shift_count += bits[i - 1];
+	}
+      for (ent = 0; ent < num_ents; ent++)
+	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
+    }
+  else  /* Each entry holds NCOUNTS gomp_ull-sized counters; clear all.  */
+    for (ent = 0; ent < num_ents; ent++)
+      memset (doacross->array + ent * elt_sz, '\0',
+	      sizeof (gomp_ull) * ncounts);
+  if (ws->sched == GFS_STATIC && chunk_size == 0)
+    {
+      gomp_ull q = counts[0] / num_ents;
+      gomp_ull t = counts[0] % num_ents;
+      doacross->boundary_ull = t * (q + 1);
+      doacross->q_ull = q;
+      doacross->t = t;
+    }
+  ws->doacross = doacross;
+}
+
+/* DOACROSS POST operation.  */
+
+void
+GOMP_doacross_ull_post (gomp_ull *counts)  /* COUNTS[0 .. ncounts - 1] are read.  */
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  unsigned long ent;  /* Index of the slot in doacross->array to publish into.  */
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))  /* No doacross state: full barrier only.  */
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    ent = thr->ts.team_id;  /* Static schedule: slot is selected by team id.  */
+  else
+    ent = counts[0] / doacross->chunk_size_ull;  /* Otherwise by chunk of COUNTS[0].  */
+
+  if (__builtin_expect (doacross->flattened, 1))  /* One unsigned long per slot packs all counts.  */
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+			      + ent * doacross->elt_sz);
+      gomp_ull flattened
+	= counts[0] << doacross->shift_counts[0];
+
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= counts[i] << doacross->shift_counts[i];
+      flattened++;  /* The published value is the packed counts plus one.  */
+      if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
+	__atomic_thread_fence (MEMMODEL_RELEASE);  /* Value already present: fence, no store.  */
+      else
+	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
+      return;
+    }
+
+  __atomic_thread_fence (MEMMODEL_ACQUIRE);
+  if (sizeof (gomp_ull) == sizeof (unsigned long))  /* One array element per count.  */
+    {
+      gomp_ull *array = (gomp_ull *) (doacross->array
+				      + ent * doacross->elt_sz);
+
+      for (i = doacross->ncounts; i-- > 0; )  /* Walks from the last count down to the first.  */
+	{
+	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))  /* Store only on change.  */
+	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
+	}
+    }
+  else  /* Sizes differ: each count occupies two unsigned long elements.  */
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+						+ ent * doacross->elt_sz);
+
+      for (i = doacross->ncounts; i-- > 0; )
+	{
+	  gomp_ull cull = counts[i] + 1UL;
+	  unsigned long c = (unsigned long) cull;  /* Truncated (low) part goes to array[2 * i + 1].  */
+	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
+	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
+	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);  /* Upper half goes to array[2 * i].  */
+	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
+	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
+	}
+    }
+}
+
+/* DOACROSS WAIT operation.  */
+
+void
+GOMP_doacross_ull_wait (gomp_ull first, ...)  /* FIRST plus ncounts - 1 further gomp_ull varargs.  */
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_work_share *ws = thr->ts.work_share;
+  struct gomp_doacross_work_share *doacross = ws->doacross;
+  va_list ap;
+  unsigned long ent;  /* Index of the slot in doacross->array to wait on.  */
+  unsigned int i;
+
+  if (__builtin_expect (doacross == NULL, 0))  /* No doacross state: full barrier only.  */
+    {
+      __sync_synchronize ();
+      return;
+    }
+
+  if (__builtin_expect (ws->sched == GFS_STATIC, 1))
+    {
+      if (ws->chunk_size_ull == 0)  /* Slot is derived from boundary_ull, q_ull and t.  */
+	{
+	  if (first < doacross->boundary_ull)
+	    ent = first / (doacross->q_ull + 1);
+	  else
+	    ent = (first - doacross->boundary_ull) / doacross->q_ull
+		  + doacross->t;
+	}
+      else
+	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;  /* Chunk index modulo team size.  */
+    }
+  else
+    ent = first / doacross->chunk_size_ull;  /* Non-static: slot is the chunk index of FIRST.  */
+
+  if (__builtin_expect (doacross->flattened, 1))  /* One unsigned long per slot packs all counts.  */
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+						+ ent * doacross->elt_sz);
+      gomp_ull flattened = first << doacross->shift_counts[0];
+      unsigned long cur;
+
+      va_start (ap, first);
+      for (i = 1; i < doacross->ncounts; i++)
+	flattened |= va_arg (ap, gomp_ull)
+		     << doacross->shift_counts[i];
+      cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
+      if (flattened < cur)  /* Slot value already exceeds the packed target: no wait.  */
+	{
+	  __atomic_thread_fence (MEMMODEL_RELEASE);
+	  va_end (ap);
+	  return;
+	}
+      doacross_spin (array, flattened, cur);  /* NOTE(review): presumably waits until *array > flattened; helper not shown here, confirm.  */
+      __atomic_thread_fence (MEMMODEL_RELEASE);
+      va_end (ap);
+      return;
+    }
+
+  if (sizeof (gomp_ull) == sizeof (unsigned long))  /* One array element per count.  */
+    {
+      gomp_ull *array = (gomp_ull *) (doacross->array
+				      + ent * doacross->elt_sz);
+      do  /* Poll, relaxing the CPU between attempts, until satisfied.  */
+	{
+	  va_start (ap, first);  /* The varargs are re-read on every poll.  */
+	  for (i = 0; i < doacross->ncounts; i++)
+	    {
+	      gomp_ull thisv
+		= (i ? va_arg (ap, gomp_ull) : first) + 1;
+	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
+	      if (thisv < cur)  /* Slot is ahead in this dimension: satisfied.  */
+		{
+		  i = doacross->ncounts;
+		  break;
+		}
+	      if (thisv > cur)  /* Slot is behind in this dimension: poll again.  */
+		break;
+	    }
+	  va_end (ap);
+	  if (i == doacross->ncounts)  /* Satisfied, or every dimension compared equal.  */
+	    break;
+	  cpu_relax ();
+	}
+      while (1);
+    }
+  else  /* Sizes differ: each count occupies two unsigned long elements.  */
+    {
+      unsigned long *array = (unsigned long *) (doacross->array
+						+ ent * doacross->elt_sz);
+      do
+	{
+	  va_start (ap, first);  /* The varargs are re-read on every poll.  */
+	  for (i = 0; i < doacross->ncounts; i++)
+	    {
+	      gomp_ull thisv
+		= (i ? va_arg (ap, gomp_ull) : first) + 1;
+	      unsigned long t
+		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);  /* Upper half, compared first.  */
+	      unsigned long cur
+		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
+	      if (t < cur)
+		{
+		  i = doacross->ncounts;
+		  break;
+		}
+	      if (t > cur)
+		break;
+	      t = thisv;  /* Truncated (low) part, compared when the upper halves are equal.  */
+	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
+	      if (t < cur)
+		{
+		  i = doacross->ncounts;
+		  break;
+		}
+	      if (t > cur)
+		break;
+	    }
+	  va_end (ap);
+	  if (i == doacross->ncounts)  /* Satisfied, or every element compared equal.  */
+	    break;
+	  cpu_relax ();
+	}
+      while (1);
+    }
+  __sync_synchronize ();  /* Full barrier after the non-flattened wait completes.  */
+}
diff --git a/libgomp/target.c b/libgomp/target.c
index 758ece5d78c..de6a2c9c9c5 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -38,6 +38,7 @@
 #endif
 #include <string.h>
 #include <assert.h>
+#include <errno.h>
 
 #ifdef PLUGIN_SUPPORT
 #include <dlfcn.h>
@@ -133,17 +134,48 @@ resolve_device (int device_id)
   if (device_id < 0 || device_id >= gomp_get_num_devices ())
     return NULL;
 
+  gomp_mutex_lock (&devices[device_id].lock);
+  if (!devices[device_id].is_initialized)
+    gomp_init_device (&devices[device_id]);
+  gomp_mutex_unlock (&devices[device_id].lock);
+
   return &devices[device_id];
 }
 
 
-/* Handle the case where splay_tree_lookup found oldn for newn.
+static inline splay_tree_key
+gomp_map_lookup (splay_tree mem_map, splay_tree_key key)  /* KEY is adjusted temporarily and restored.  */
+{
+  if (key->host_start != key->host_end)  /* Non-empty range: a single plain lookup.  */
+    return splay_tree_lookup (mem_map, key);
+
+  key->host_end++;  /* Empty range: first try [host_start, host_end + 1).  */
+  splay_tree_key n = splay_tree_lookup (mem_map, key);
+  key->host_end--;
+  if (n)
+    return n;
+  key->host_start--;  /* Then try [host_start - 1, host_end).  */
+  n = splay_tree_lookup (mem_map, key);
+  key->host_start++;
+  if (n)
+    return n;
+  return splay_tree_lookup (mem_map, key);  /* Finally the unmodified empty range; may be NULL.  */
+}
+
+/* Handle the case where gomp_map_lookup found oldn for newn.
    Helper function of gomp_map_vars.  */
 
 static inline void
 gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
-			splay_tree_key newn, unsigned char kind)
+			splay_tree_key newn, struct target_var_desc *tgt_var,
+			unsigned char kind)
 {
+  tgt_var->key = oldn;
+  tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
+  tgt_var->always_copy_from = GOMP_MAP_ALWAYS_FROM_P (kind);
+  tgt_var->offset = newn->host_start - oldn->host_start;
+  tgt_var->length = newn->host_end - newn->host_start;
+
   if ((kind & GOMP_MAP_FLAG_FORCE)
       || oldn->host_start > newn->host_start
       || oldn->host_end < newn->host_end)
@@ -154,14 +186,22 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn,
 		  (void *) newn->host_start, (void *) newn->host_end,
 		  (void *) oldn->host_start, (void *) oldn->host_end);
     }
-  oldn->refcount++;
+
+  if (GOMP_MAP_ALWAYS_TO_P (kind))
+    devicep->host2dev_func (devicep->target_id,
+			    (void *) (oldn->tgt->tgt_start + oldn->tgt_offset
+				      + newn->host_start - oldn->host_start),
+			    (void *) newn->host_start,
+			    newn->host_end - newn->host_start);
+  if (oldn->refcount != REFCOUNT_INFINITY)
+    oldn->refcount++;
 }
 
 static int
-get_kind (bool is_openacc, void *kinds, int idx)
+get_kind (bool short_mapkind, void *kinds, int idx)  /* KINDS is read as unsigned short[] if SHORT_MAPKIND, else unsigned char[].  */
 {
-  return is_openacc ? ((unsigned short *) kinds)[idx]
-		    : ((unsigned char *) kinds)[idx];
+  return short_mapkind ? ((unsigned short *) kinds)[idx]
+		       : ((unsigned char *) kinds)[idx];
 }
 
 static void
@@ -185,20 +225,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
     }
   /* Add bias to the pointer value.  */
   cur_node.host_start += bias;
-  cur_node.host_end = cur_node.host_start + 1;
-  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
-  if (n == NULL)
-    {
-      /* Could be possibly zero size array section.  */
-      cur_node.host_end--;
-      n = splay_tree_lookup (mem_map, &cur_node);
-      if (n == NULL)
-	{
-	  cur_node.host_start--;
-	  n = splay_tree_lookup (mem_map, &cur_node);
-	  cur_node.host_start++;
-	}
-    }
+  cur_node.host_end = cur_node.host_start;
+  splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
   if (n == NULL)
     {
       gomp_mutex_unlock (&devicep->lock);
@@ -218,20 +246,81 @@ gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
 			  sizeof (void *));
 }
 
+static void
+gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,  /* N: existing mapping the element must belong to.  */
+			  size_t first, size_t i, void **hostaddrs,  /* I: element index; FIRST - 1 is used as a lower bound below.  */
+			  size_t *sizes, void *kinds)
+{
+  struct gomp_device_descr *devicep = tgt->device_descr;
+  struct splay_tree_s *mem_map = &devicep->mem_map;
+  struct splay_tree_key_s cur_node;
+  int kind;
+  const bool short_mapkind = true;  /* Kinds are always read as unsigned short here.  */
+  const int typemask = short_mapkind ? 0xff : 0x7;  /* Hence always 0xff in this function.  */
+
+  cur_node.host_start = (uintptr_t) hostaddrs[i];
+  cur_node.host_end = cur_node.host_start + sizes[i];
+  splay_tree_key n2 = splay_tree_lookup (mem_map, &cur_node);
+  kind = get_kind (short_mapkind, kinds, i);
+  if (n2  /* Accept only a hit in the same tgt with the same host-to-target displacement as N.  */
+      && n2->tgt == n->tgt
+      && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
+    {
+      gomp_map_vars_existing (devicep, n2, &cur_node,
+			      &tgt->list[i], kind & typemask);
+      return;
+    }
+  if (sizes[i] == 0)  /* Zero-sized element: retry with the range widened by one byte.  */
+    {
+      if (cur_node.host_start > (uintptr_t) hostaddrs[first - 1])  /* Only step back if above hostaddrs[first - 1].  */
+	{
+	  cur_node.host_start--;
+	  n2 = splay_tree_lookup (mem_map, &cur_node);
+	  cur_node.host_start++;
+	  if (n2
+	      && n2->tgt == n->tgt
+	      && n2->host_start - n->host_start
+		 == n2->tgt_offset - n->tgt_offset)
+	    {
+	      gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+				      kind & typemask);
+	      return;
+	    }
+	}
+      cur_node.host_end++;  /* Then try one byte past the end.  */
+      n2 = splay_tree_lookup (mem_map, &cur_node);
+      cur_node.host_end--;
+      if (n2
+	  && n2->tgt == n->tgt
+	  && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
+	{
+	  gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
+				  kind & typemask);
+	  return;
+	}
+    }
+  gomp_mutex_unlock (&devicep->lock);  /* No acceptable mapping: release the device lock before the fatal error.  */
+  gomp_fatal ("Trying to map into device [%p..%p) structure element when "
+	      "other mapped elements from the same structure weren't mapped "
+	      "together with it", (void *) cur_node.host_start,
+	      (void *) cur_node.host_end);
+}
+
 attribute_hidden struct target_mem_desc *
 gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 	       void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds,
-	       bool is_openacc, bool is_target)
+	       bool short_mapkind, enum gomp_map_vars_kind pragma_kind)
 {
   size_t i, tgt_align, tgt_size, not_found_cnt = 0;
-  const int rshift = is_openacc ? 8 : 3;
-  const int typemask = is_openacc ? 0xff : 0x7;
+  bool has_firstprivate = false;
+  const int rshift = short_mapkind ? 8 : 3;
+  const int typemask = short_mapkind ? 0xff : 0x7;
   struct splay_tree_s *mem_map = &devicep->mem_map;
   struct splay_tree_key_s cur_node;
   struct target_mem_desc *tgt
     = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum);
   tgt->list_count = mapnum;
-  tgt->refcount = 1;
+  tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
   tgt->device_descr = devicep;
 
   if (mapnum == 0)
@@ -239,7 +328,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 
   tgt_align = sizeof (void *);
   tgt_size = 0;
-  if (is_target)
+  if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       size_t align = 4 * sizeof (void *);
       tgt_align = align;
@@ -250,10 +339,61 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 
   for (i = 0; i < mapnum; i++)
     {
-      int kind = get_kind (is_openacc, kinds, i);
-      if (hostaddrs[i] == NULL)
+      int kind = get_kind (short_mapkind, kinds, i);
+      if (hostaddrs[i] == NULL
+	  || (kind & typemask) == GOMP_MAP_FIRSTPRIVATE_INT)
 	{
-	  tgt->list[i] = NULL;
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = ~(uintptr_t) 0;
+	  continue;
+	}
+      else if ((kind & typemask) == GOMP_MAP_USE_DEVICE_PTR)
+	{
+	  cur_node.host_start = (uintptr_t) hostaddrs[i];
+	  cur_node.host_end = cur_node.host_start;
+	  splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
+	  if (n == NULL)
+	    {
+	      gomp_mutex_unlock (&devicep->lock);
+	      gomp_fatal ("use_device_ptr pointer wasn't mapped");
+	    }
+	  cur_node.host_start -= n->host_start;
+	  hostaddrs[i]
+	    = (void *) (n->tgt->tgt_start + n->tgt_offset
+			+ cur_node.host_start);
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = ~(uintptr_t) 0;
+	  continue;
+	}
+      else if ((kind & typemask) == GOMP_MAP_STRUCT)
+	{
+	  size_t first = i + 1;
+	  size_t last = i + sizes[i];
+	  cur_node.host_start = (uintptr_t) hostaddrs[i];
+	  cur_node.host_end = (uintptr_t) hostaddrs[last]
+			      + sizes[last];
+	  tgt->list[i].key = NULL;
+	  tgt->list[i].offset = ~(uintptr_t) 2;
+	  splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
+	  if (n == NULL)
+	    {
+	      size_t align = (size_t) 1 << (kind >> rshift);
+	      if (tgt_align < align)
+		tgt_align = align;
+	      tgt_size -= (uintptr_t) hostaddrs[first]
+			  - (uintptr_t) hostaddrs[i];
+	      tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	      tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
+	      not_found_cnt += last - i;
+	      for (i = first; i <= last; i++)
+		tgt->list[i].key = NULL;
+	      i--;
+	      continue;
+	    }
+	  for (i = first; i <= last; i++)
+	    gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+				      sizes, kinds);
+	  i--;
 	  continue;
 	}
       cur_node.host_start = (uintptr_t) hostaddrs[i];
@@ -261,15 +401,37 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 	cur_node.host_end = cur_node.host_start + sizes[i];
       else
 	cur_node.host_end = cur_node.host_start + sizeof (void *);
-      splay_tree_key n = splay_tree_lookup (mem_map, &cur_node);
-      if (n)
+      if ((kind & typemask) == GOMP_MAP_FIRSTPRIVATE)
+	{
+	  tgt->list[i].key = NULL;
+
+	  size_t align = (size_t) 1 << (kind >> rshift);
+	  if (tgt_align < align)
+	    tgt_align = align;
+	  tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	  tgt_size += cur_node.host_end - cur_node.host_start;
+	  has_firstprivate = true;
+	  continue;
+	}
+      splay_tree_key n;
+      if ((kind & typemask) == GOMP_MAP_ZERO_LEN_ARRAY_SECTION)
 	{
-	  tgt->list[i] = n;
-	  gomp_map_vars_existing (devicep, n, &cur_node, kind & typemask);
+	  n = gomp_map_lookup (mem_map, &cur_node);
+	  if (!n)
+	    {
+	      tgt->list[i].key = NULL;
+	      tgt->list[i].offset = ~(uintptr_t) 1;
+	      continue;
+	    }
 	}
       else
+	n = splay_tree_lookup (mem_map, &cur_node);
+      if (n)
+	gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
+				kind & typemask);
+      else
 	{
-	  tgt->list[i] = NULL;
+	  tgt->list[i].key = NULL;
 
 	  size_t align = (size_t) 1 << (kind >> rshift);
 	  not_found_cnt++;
@@ -281,7 +443,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 	    {
 	      size_t j;
 	      for (j = i + 1; j < mapnum; j++)
-		if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j)
+		if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds, j)
 					 & typemask))
 		  break;
 		else if ((uintptr_t) hostaddrs[j] < cur_node.host_start
@@ -290,7 +452,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 		  break;
 		else
 		  {
-		    tgt->list[j] = NULL;
+		    tgt->list[j].key = NULL;
 		    i++;
 		  }
 	    }
@@ -308,7 +470,7 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
       tgt->tgt_start = (uintptr_t) tgt->to_free;
       tgt->tgt_end = tgt->tgt_start + sizes[0];
     }
-  else if (not_found_cnt || is_target)
+  else if (not_found_cnt || pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       /* Allocate tgt_align aligned tgt_size block of memory.  */
       /* FIXME: Perhaps change interface to allocate properly aligned
@@ -327,22 +489,74 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
     }
 
   tgt_size = 0;
-  if (is_target)
+  if (pragma_kind == GOMP_MAP_VARS_TARGET)
     tgt_size = mapnum * sizeof (void *);
 
   tgt->array = NULL;
-  if (not_found_cnt)
+  if (not_found_cnt || has_firstprivate)
     {
-      tgt->array = gomp_malloc (not_found_cnt * sizeof (*tgt->array));
+      if (not_found_cnt)
+	tgt->array = gomp_malloc (not_found_cnt * sizeof (*tgt->array));
       splay_tree_node array = tgt->array;
-      size_t j;
+      size_t j, field_tgt_offset = 0, field_tgt_clear = ~(size_t) 0;
+      uintptr_t field_tgt_base = 0;
 
       for (i = 0; i < mapnum; i++)
-	if (tgt->list[i] == NULL)
+	if (tgt->list[i].key == NULL)
 	  {
-	    int kind = get_kind (is_openacc, kinds, i);
+	    int kind = get_kind (short_mapkind, kinds, i);
 	    if (hostaddrs[i] == NULL)
 	      continue;
+	    switch (kind & typemask)
+	      {
+		size_t align, len, first, last;
+		splay_tree_key n;
+	      case GOMP_MAP_FIRSTPRIVATE:
+		align = (size_t) 1 << (kind >> rshift);
+		tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		tgt->list[i].offset = tgt_size;
+		len = sizes[i];
+		devicep->host2dev_func (devicep->target_id,
+					(void *) (tgt->tgt_start + tgt_size),
+					(void *) hostaddrs[i], len);
+		tgt_size += len;
+		continue;
+	      case GOMP_MAP_FIRSTPRIVATE_INT:
+	      case GOMP_MAP_USE_DEVICE_PTR:
+	      case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
+		continue;
+	      case GOMP_MAP_STRUCT:
+		first = i + 1;
+		last = i + sizes[i];
+		cur_node.host_start = (uintptr_t) hostaddrs[i];
+		cur_node.host_end = (uintptr_t) hostaddrs[last]
+				    + sizes[last];
+		if (tgt->list[first].key != NULL)
+		  continue;
+		n = splay_tree_lookup (mem_map, &cur_node);
+		if (n == NULL)
+		  {
+		    size_t align = (size_t) 1 << (kind >> rshift);
+		    tgt_size -= (uintptr_t) hostaddrs[first]
+				- (uintptr_t) hostaddrs[i];
+		    tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		    tgt_size += (uintptr_t) hostaddrs[first]
+				- (uintptr_t) hostaddrs[i];
+		    field_tgt_base = (uintptr_t) hostaddrs[first];
+		    field_tgt_offset = tgt_size;
+		    field_tgt_clear = last;
+		    tgt_size += cur_node.host_end
+				- (uintptr_t) hostaddrs[first];
+		    continue;
+		  }
+		for (i = first; i <= last; i++)
+		  gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
+					    sizes, kinds);
+		i--;
+		continue;
+	      default:
+		break;
+	      }
 	    splay_tree_key k = &array->key;
 	    k->host_start = (uintptr_t) hostaddrs[i];
 	    if (!GOMP_MAP_POINTER_P (kind & typemask))
@@ -351,19 +565,31 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 	      k->host_end = k->host_start + sizeof (void *);
 	    splay_tree_key n = splay_tree_lookup (mem_map, k);
 	    if (n)
-	      {
-		tgt->list[i] = n;
-		gomp_map_vars_existing (devicep, n, k, kind & typemask);
-	      }
+	      gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
+				      kind & typemask);
 	    else
 	      {
 		size_t align = (size_t) 1 << (kind >> rshift);
-		tgt->list[i] = k;
-		tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		tgt->list[i].key = k;
 		k->tgt = tgt;
-		k->tgt_offset = tgt_size;
-		tgt_size += k->host_end - k->host_start;
-		k->copy_from = GOMP_MAP_COPY_FROM_P (kind & typemask);
+		if (field_tgt_clear != ~(size_t) 0)
+		  {
+		    k->tgt_offset = k->host_start - field_tgt_base
+				    + field_tgt_offset;
+		    if (i == field_tgt_clear)
+		      field_tgt_clear = ~(size_t) 0;
+		  }
+		else
+		  {
+		    tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		    k->tgt_offset = tgt_size;
+		    tgt_size += k->host_end - k->host_start;
+		  }
+		tgt->list[i].copy_from = GOMP_MAP_COPY_FROM_P (kind & typemask);
+		tgt->list[i].always_copy_from
+		  = GOMP_MAP_ALWAYS_FROM_P (kind & typemask);
+		tgt->list[i].offset = 0;
+		tgt->list[i].length = k->host_end - k->host_start;
 		k->refcount = 1;
 		k->async_refcount = 0;
 		tgt->refcount++;
@@ -376,11 +602,14 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 		  case GOMP_MAP_FROM:
 		  case GOMP_MAP_FORCE_ALLOC:
 		  case GOMP_MAP_FORCE_FROM:
+		  case GOMP_MAP_ALWAYS_FROM:
 		    break;
 		  case GOMP_MAP_TO:
 		  case GOMP_MAP_TOFROM:
 		  case GOMP_MAP_FORCE_TO:
 		  case GOMP_MAP_FORCE_TOFROM:
+		  case GOMP_MAP_ALWAYS_TO:
+		  case GOMP_MAP_ALWAYS_TOFROM:
 		    /* FIXME: Perhaps add some smarts, like if copying
 		       several adjacent fields from host to target, use some
 		       host buffer to avoid sending each var individually.  */
@@ -403,7 +632,8 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 					    k->host_end - k->host_start);
 
 		    for (j = i + 1; j < mapnum; j++)
-		      if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j)
+		      if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
+							 j)
 					       & typemask))
 			break;
 		      else if ((uintptr_t) hostaddrs[j] < k->host_start
@@ -412,8 +642,11 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 			break;
 		      else
 			{
-			  tgt->list[j] = k;
-			  k->refcount++;
+			  tgt->list[j].key = k;
+			  tgt->list[j].copy_from = false;
+			  tgt->list[j].always_copy_from = false;
+			  if (k->refcount != REFCOUNT_INFINITY)
+			    k->refcount++;
 			  gomp_map_pointer (tgt,
 					    (uintptr_t) *(void **) hostaddrs[j],
 					    k->tgt_offset
@@ -460,15 +693,30 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 	  }
     }
 
-  if (is_target)
+  if (pragma_kind == GOMP_MAP_VARS_TARGET)
     {
       for (i = 0; i < mapnum; i++)
 	{
-	  if (tgt->list[i] == NULL)
-	    cur_node.tgt_offset = (uintptr_t) NULL;
+	  if (tgt->list[i].key == NULL)
+	    {
+	      if (tgt->list[i].offset == ~(uintptr_t) 0)
+		cur_node.tgt_offset = (uintptr_t) hostaddrs[i];
+	      else if (tgt->list[i].offset == ~(uintptr_t) 1)
+		cur_node.tgt_offset = 0;
+	      else if (tgt->list[i].offset == ~(uintptr_t) 2)
+		cur_node.tgt_offset = tgt->list[i + 1].key->tgt->tgt_start
+				      + tgt->list[i + 1].key->tgt_offset
+				      + tgt->list[i + 1].offset
+				      + (uintptr_t) hostaddrs[i]
+				      - (uintptr_t) hostaddrs[i + 1];
+	      else
+		cur_node.tgt_offset = tgt->tgt_start
+				      + tgt->list[i].offset;
+	    }
 	  else
-	    cur_node.tgt_offset = tgt->list[i]->tgt->tgt_start
-				  + tgt->list[i]->tgt_offset;
+	    cur_node.tgt_offset = tgt->list[i].key->tgt->tgt_start
+				  + tgt->list[i].key->tgt_offset
+				  + tgt->list[i].offset;
 	  /* FIXME: see above FIXME comment.  */
 	  devicep->host2dev_func (devicep->target_id,
 				  (void *) (tgt->tgt_start
@@ -478,6 +726,15 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 	}
     }
 
+  /* If the variable from "omp target enter data" map-list was already mapped,
+     tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
+     gomp_exit_data.  */
+  if (pragma_kind == GOMP_MAP_VARS_ENTER_DATA && tgt->refcount == 0)
+    {
+      free (tgt);
+      tgt = NULL;
+    }
+
   gomp_mutex_unlock (&devicep->lock);
   return tgt;
 }
@@ -508,17 +765,17 @@ gomp_copy_from_async (struct target_mem_desc *tgt)
   gomp_mutex_lock (&devicep->lock);
 
   for (i = 0; i < tgt->list_count; i++)
-    if (tgt->list[i] == NULL)
+    if (tgt->list[i].key == NULL)
       ;
-    else if (tgt->list[i]->refcount > 1)
+    else if (tgt->list[i].key->refcount > 1)
       {
-	tgt->list[i]->refcount--;
-	tgt->list[i]->async_refcount++;
+	tgt->list[i].key->refcount--;
+	tgt->list[i].key->async_refcount++;
       }
     else
       {
-	splay_tree_key k = tgt->list[i];
-	if (k->copy_from)
+	splay_tree_key k = tgt->list[i].key;
+	if (tgt->list[i].copy_from)
 	  devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
 				  (void *) (k->tgt->tgt_start + k->tgt_offset),
 				  k->host_end - k->host_start);
@@ -546,25 +803,41 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
 
   size_t i;
   for (i = 0; i < tgt->list_count; i++)
-    if (tgt->list[i] == NULL)
-      ;
-    else if (tgt->list[i]->refcount > 1)
-      tgt->list[i]->refcount--;
-    else if (tgt->list[i]->async_refcount > 0)
-      tgt->list[i]->async_refcount--;
-    else
-      {
-	splay_tree_key k = tgt->list[i];
-	if (k->copy_from && do_copyfrom)
-	  devicep->dev2host_func (devicep->target_id, (void *) k->host_start,
-				  (void *) (k->tgt->tgt_start + k->tgt_offset),
-				  k->host_end - k->host_start);
-	splay_tree_remove (&devicep->mem_map, k);
-	if (k->tgt->refcount > 1)
-	  k->tgt->refcount--;
-	else
-	  gomp_unmap_tgt (k->tgt);
-      }
+    {
+      splay_tree_key k = tgt->list[i].key;
+      if (k == NULL)
+	continue;
+
+      bool do_unmap = false;
+      if (k->refcount > 1 && k->refcount != REFCOUNT_INFINITY)
+	k->refcount--;
+      else if (k->refcount == 1)
+	{
+	  if (k->async_refcount > 0)
+	    k->async_refcount--;
+	  else
+	    {
+	      k->refcount--;
+	      do_unmap = true;
+	    }
+	}
+
+      if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
+	  || tgt->list[i].always_copy_from)
+	devicep->dev2host_func (devicep->target_id,
+				(void *) (k->host_start + tgt->list[i].offset),
+				(void *) (k->tgt->tgt_start + k->tgt_offset
+					  + tgt->list[i].offset),
+				tgt->list[i].length);
+      if (do_unmap)
+	{
+	  splay_tree_remove (&devicep->mem_map, k);
+	  if (k->tgt->refcount > 1)
+	    k->tgt->refcount--;
+	  else
+	    gomp_unmap_tgt (k->tgt);
+	}
+    }
 
   if (tgt->refcount > 1)
     tgt->refcount--;
@@ -576,11 +849,11 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
 
 static void
 gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
-	     size_t *sizes, void *kinds, bool is_openacc)
+	     size_t *sizes, void *kinds, bool short_mapkind)
 {
   size_t i;
   struct splay_tree_key_s cur_node;
-  const int typemask = is_openacc ? 0xff : 0x7;
+  const int typemask = short_mapkind ? 0xff : 0x7;
 
   if (!devicep)
     return;
@@ -597,7 +870,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 	splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node);
 	if (n)
 	  {
-	    int kind = get_kind (is_openacc, kinds, i);
+	    int kind = get_kind (short_mapkind, kinds, i);
 	    if (n->host_start > cur_node.host_start
 		|| n->host_end < cur_node.host_end)
 	      {
@@ -626,13 +899,6 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs,
 						- n->host_start),
 				      cur_node.host_end - cur_node.host_start);
 	  }
-	else
-	  {
-	    gomp_mutex_unlock (&devicep->lock);
-	    gomp_fatal ("Trying to update [%p..%p) object that is not mapped",
-			(void *) cur_node.host_start,
-			(void *) cur_node.host_end);
-	  }
       }
   gomp_mutex_unlock (&devicep->lock);
 }
@@ -678,7 +944,7 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
   /* Insert host-target address mapping into splay tree.  */
   struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt));
   tgt->array = gomp_malloc ((num_funcs + num_vars) * sizeof (*tgt->array));
-  tgt->refcount = 1;
+  tgt->refcount = REFCOUNT_INFINITY;
   tgt->tgt_start = 0;
   tgt->tgt_end = 0;
   tgt->to_free = NULL;
@@ -694,9 +960,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->host_end = k->host_start + 1;
       k->tgt = tgt;
       k->tgt_offset = target_table[i].start;
-      k->refcount = 1;
+      k->refcount = REFCOUNT_INFINITY;
       k->async_refcount = 0;
-      k->copy_from = false;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -720,9 +985,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->host_end = k->host_start + (uintptr_t) host_var_table[i * 2 + 1];
       k->tgt = tgt;
       k->tgt_offset = target_var->start;
-      k->refcount = 1;
+      k->refcount = REFCOUNT_INFINITY;
       k->async_refcount = 0;
-      k->copy_from = false;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -945,6 +1209,47 @@ gomp_fini_device (struct gomp_device_descr *devicep)
   devicep->is_initialized = false;
 }
 
+/* Host fallback for GOMP_target{,_41} routines.  */
+
+static void
+gomp_target_fallback (void (*fn) (void *), void **hostaddrs)  /* Calls FN (HOSTADDRS) on the host.  */
+{
+  struct gomp_thread old_thr, *thr = gomp_thread ();
+  old_thr = *thr;  /* Save the whole thread state; restored after FN returns.  */
+  memset (thr, '\0', sizeof (*thr));  /* FN runs with a zeroed thread state.  */
+  if (gomp_places_list)  /* With a places list, keep the place and set the partition length.  */
+    {
+      thr->place = old_thr.place;
+      thr->ts.place_partition_len = gomp_places_list_len;
+    }
+  fn (hostaddrs);
+  gomp_free_thread (thr);  /* Called on the temporary state before the saved state is restored.  */
+  *thr = old_thr;
+}
+
+/* Helper function of GOMP_target{,_41} routines.  */
+
+static void *
+gomp_get_target_fn_addr (struct gomp_device_descr *devicep,
+			 void (*host_fn) (void *))  /* Returns the address to run for HOST_FN on DEVICEP.  */
+{
+  if (devicep->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC)  /* With this capability HOST_FN is returned as is.  */
+    return (void *) host_fn;
+  else
+    {
+      gomp_mutex_lock (&devicep->lock);  /* The lookup in mem_map is done under the device lock.  */
+      struct splay_tree_key_s k;
+      k.host_start = (uintptr_t) host_fn;
+      k.host_end = k.host_start + 1;  /* Key is the one-byte range starting at HOST_FN.  */
+      splay_tree_key tgt_fn = splay_tree_lookup (&devicep->mem_map, &k);
+      gomp_mutex_unlock (&devicep->lock);
+      if (tgt_fn == NULL)  /* Checked after unlocking, so the fatal error is raised without the lock held.  */
+	gomp_fatal ("Target function wasn't mapped");
+
+      return (void *) tgt_fn->tgt_offset;  /* The entry's tgt_offset is returned as the address.  */
+    }
+}
+
 /* Called when encountering a target directive.  If DEVICE
    is GOMP_DEVICE_ICV, it means use device-var ICV.  If it is
    GOMP_DEVICE_HOST_FALLBACK (or any value
@@ -964,51 +1269,85 @@ GOMP_target (int device, void (*fn) (void *), const void *unused,
 
   if (devicep == NULL
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return gomp_target_fallback (fn, hostaddrs);
+
+  void *fn_addr = gomp_get_target_fn_addr (devicep, fn);
+
+  struct target_mem_desc *tgt_vars
+    = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
+		     GOMP_MAP_VARS_TARGET);
+  struct gomp_thread old_thr, *thr = gomp_thread ();
+  old_thr = *thr;
+  memset (thr, '\0', sizeof (*thr));
+  if (gomp_places_list)
     {
-      /* Host fallback.  */
-      struct gomp_thread old_thr, *thr = gomp_thread ();
-      old_thr = *thr;
-      memset (thr, '\0', sizeof (*thr));
-      if (gomp_places_list)
-	{
-	  thr->place = old_thr.place;
-	  thr->ts.place_partition_len = gomp_places_list_len;
-	}
-      fn (hostaddrs);
-      gomp_free_thread (thr);
-      *thr = old_thr;
-      return;
+      thr->place = old_thr.place;
+      thr->ts.place_partition_len = gomp_places_list_len;
     }
+  devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start);
+  gomp_free_thread (thr);
+  *thr = old_thr;
+  gomp_unmap_vars (tgt_vars, true);
+}
 
-  gomp_mutex_lock (&devicep->lock);
-  if (!devicep->is_initialized)
-    gomp_init_device (devicep);
-  gomp_mutex_unlock (&devicep->lock);
+void
+GOMP_target_41 (int device, void (*fn) (void *), size_t mapnum,
+		void **hostaddrs, size_t *sizes, unsigned short *kinds,
+		unsigned int flags, void **depend)
+{
+  struct gomp_device_descr *devicep = resolve_device (device);
 
-  void *fn_addr;
+  /* If there are depend clauses, but nowait is not present,
+     block the parent task until the dependencies are resolved
+     and then just continue with the rest of the function as if it
+     is a merged task.  */
+  if (depend != NULL)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->task && thr->task->depend_hash)
+	gomp_task_maybe_wait_for_dependencies (depend);
+    }
 
-  if (devicep->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC)
-    fn_addr = (void *) fn;
-  else
+  if (devicep == NULL
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     {
-      gomp_mutex_lock (&devicep->lock);
-      struct splay_tree_key_s k;
-      k.host_start = (uintptr_t) fn;
-      k.host_end = k.host_start + 1;
-      splay_tree_key tgt_fn = splay_tree_lookup (&devicep->mem_map, &k);
-      if (tgt_fn == NULL)
+      size_t i, tgt_align = 0, tgt_size = 0;
+      char *tgt = NULL;
+      for (i = 0; i < mapnum; i++)
+	if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+	  {
+	    size_t align = (size_t) 1 << (kinds[i] >> 8);
+	    if (tgt_align < align)
+	      tgt_align = align;
+	    tgt_size = (tgt_size + align - 1) & ~(align - 1);
+	    tgt_size += sizes[i];
+	  }
+      if (tgt_align)
 	{
-	  gomp_mutex_unlock (&devicep->lock);
-	  gomp_fatal ("Target function wasn't mapped");
+	  tgt = gomp_alloca (tgt_size + tgt_align - 1);
+	  uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+	  if (al)
+	    tgt += tgt_align - al;
+	  tgt_size = 0;
+	  for (i = 0; i < mapnum; i++)
+	    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+	      {
+		size_t align = (size_t) 1 << (kinds[i] >> 8);
+		tgt_size = (tgt_size + align - 1) & ~(align - 1);
+		memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+		hostaddrs[i] = tgt + tgt_size;
+		tgt_size = tgt_size + sizes[i];
+	      }
 	}
-      gomp_mutex_unlock (&devicep->lock);
-
-      fn_addr = (void *) tgt_fn->tgt_offset;
+      gomp_target_fallback (fn, hostaddrs);
+      return;
     }
 
+  void *fn_addr = gomp_get_target_fn_addr (devicep, fn);
+
   struct target_mem_desc *tgt_vars
-    = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
-		     true);
+    = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
+		     GOMP_MAP_VARS_TARGET);
   struct gomp_thread old_thr, *thr = gomp_thread ();
   old_thr = *thr;
   memset (thr, '\0', sizeof (*thr));
@@ -1023,6 +1362,26 @@ GOMP_target (int device, void (*fn) (void *), const void *unused,
   gomp_unmap_vars (tgt_vars, true);
 }
 
+/* Host fallback shared by GOMP_target_data and GOMP_target_data_41.  */
+
+static void
+gomp_target_data_fallback (void)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  if (icv->target_data)
+    {
+      /* Even when doing a host fallback, if any #pragma omp target data
+	 construct is already active, the new one must be remembered too
+	 (as a descriptor with no device and zero mappings); otherwise
+	 GOMP_target_end_data would get out of sync.  */
+      struct target_mem_desc *tgt
+	= gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, false,
+			 GOMP_MAP_VARS_DATA);
+      tgt->prev = icv->target_data;	/* Link TGT in front of the current entry.  */
+      icv->target_data = tgt;
+    }
+}
+
 void
 GOMP_target_data (int device, const void *unused, size_t mapnum,
 		  void **hostaddrs, size_t *sizes, unsigned char *kinds)
@@ -1031,31 +1390,29 @@ GOMP_target_data (int device, const void *unused, size_t mapnum,
 
   if (devicep == NULL
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
-    {
-      /* Host fallback.  */
-      struct gomp_task_icv *icv = gomp_icv (false);
-      if (icv->target_data)
-	{
-	  /* Even when doing a host fallback, if there are any active
-	     #pragma omp target data constructs, need to remember the
-	     new #pragma omp target data, otherwise GOMP_target_end_data
-	     would get out of sync.  */
-	  struct target_mem_desc *tgt
-	    = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, false, false);
-	  tgt->prev = icv->target_data;
-	  icv->target_data = tgt;
-	}
-      return;
-    }
-
-  gomp_mutex_lock (&devicep->lock);
-  if (!devicep->is_initialized)
-    gomp_init_device (devicep);
-  gomp_mutex_unlock (&devicep->lock);
+    return gomp_target_data_fallback ();
 
   struct target_mem_desc *tgt
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false,
-		     false);
+		     GOMP_MAP_VARS_DATA);
+  struct gomp_task_icv *icv = gomp_icv (true);
+  tgt->prev = icv->target_data;
+  icv->target_data = tgt;
+}
+
+void
+GOMP_target_data_41 (int device, size_t mapnum, void **hostaddrs, size_t *sizes,
+		     unsigned short *kinds)
+{
+  struct gomp_device_descr *devicep = resolve_device (device);
+
+  if (devicep == NULL
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return gomp_target_data_fallback ();
+
+  struct target_mem_desc *tgt
+    = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
+		     GOMP_MAP_VARS_DATA);
   struct gomp_task_icv *icv = gomp_icv (true);
   tgt->prev = icv->target_data;
   icv->target_data = tgt;
@@ -1083,12 +1440,230 @@ GOMP_target_update (int device, const void *unused, size_t mapnum,
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     return;
 
+  gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, false);
+}
+
+void
+GOMP_target_update_41 (int device, size_t mapnum, void **hostaddrs,
+		       size_t *sizes, unsigned short *kinds,
+		       unsigned int flags, void **depend)
+{
+  struct gomp_device_descr *devicep = resolve_device (device);
+
+  /* With depend clauses and nowait (inside a team, parent task not
+     final), hand the update off to a target task and return.  With
+     depend clauses but no nowait, block the parent task until the
+     dependencies are resolved and then continue below as if this
+     were a merged task.  Until tasks can be scheduled during variable
+     mapping or unmapping, nowait without depend clauses is ignored.  */
+  if (depend != NULL)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->task && thr->task->depend_hash)
+	{
+	  if ((flags & GOMP_TARGET_FLAG_NOWAIT)
+	      && thr->ts.team
+	      && !thr->task->final_task)
+	    {
+	      gomp_create_target_task (devicep, (void (*) (void *)) NULL,
+				       mapnum, hostaddrs, sizes, kinds,
+				       flags | GOMP_TARGET_FLAG_UPDATE,
+				       depend);
+	      return;
+	    }
+
+	  struct gomp_team *team = thr->ts.team;
+	  /* If parallel or taskgroup has been cancelled, don't start new
+	     tasks.  */
+	  if (team
+	      && (gomp_team_barrier_cancelled (&team->barrier)
+		  || (thr->task->taskgroup
+		      && thr->task->taskgroup->cancelled)))
+	    return;
+
+	  gomp_task_maybe_wait_for_dependencies (depend);
+	}
+    }
+
+  if (devicep == NULL
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return;	/* No device with the OpenMP 4.0 offload capability.  */
+
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team
+      && (gomp_team_barrier_cancelled (&team->barrier)
+	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
+    return;
+
+  gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true);
+}
+
+static void
+gomp_exit_data (struct gomp_device_descr *devicep, size_t mapnum,
+		void **hostaddrs, size_t *sizes, unsigned short *kinds)
+{	/* Process MAPNUM exit-data entries on DEVICEP under DEVICEP->lock.  */
+  const int typemask = 0xff;	/* Low byte of a KINDS entry is the map kind.  */
+  size_t i;
   gomp_mutex_lock (&devicep->lock);
-  if (!devicep->is_initialized)
-    gomp_init_device (devicep);
+  for (i = 0; i < mapnum; i++)
+    {
+      struct splay_tree_key_s cur_node;
+      unsigned char kind = kinds[i] & typemask;
+      switch (kind)
+	{
+	case GOMP_MAP_FROM:
+	case GOMP_MAP_ALWAYS_FROM:
+	case GOMP_MAP_DELETE:
+	case GOMP_MAP_RELEASE:
+	case GOMP_MAP_ZERO_LEN_ARRAY_SECTION:
+	case GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION:
+	  cur_node.host_start = (uintptr_t) hostaddrs[i];
+	  cur_node.host_end = cur_node.host_start + sizes[i];
+	  splay_tree_key k = (kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION
+			      || kind == GOMP_MAP_ZERO_LEN_ARRAY_SECTION)
+	    ? gomp_map_lookup (&devicep->mem_map, &cur_node)
+	    : splay_tree_lookup (&devicep->mem_map, &cur_node);
+	  if (!k)
+	    continue;	/* Nothing found in mem_map: move on to the next entry.  */
+
+	  if (k->refcount > 0 && k->refcount != REFCOUNT_INFINITY)
+	    k->refcount--;	/* Drop one reference.  */
+	  if ((kind == GOMP_MAP_DELETE
+	       || kind == GOMP_MAP_DELETE_ZERO_LEN_ARRAY_SECTION)
+	      && k->refcount != REFCOUNT_INFINITY)
+	    k->refcount = 0;	/* Delete kinds drop all remaining references.  */
+
+	  if ((kind == GOMP_MAP_FROM && k->refcount == 0)
+	      || kind == GOMP_MAP_ALWAYS_FROM)	/* Copy the range back via dev2host.  */
+	    devicep->dev2host_func (devicep->target_id,
+				    (void *) cur_node.host_start,
+				    (void *) (k->tgt->tgt_start + k->tgt_offset
+					      + cur_node.host_start
+					      - k->host_start),
+				    cur_node.host_end - cur_node.host_start);
+	  if (k->refcount == 0)	/* Last reference gone: remove the key.  */
+	    {
+	      splay_tree_remove (&devicep->mem_map, k);
+	      if (k->tgt->refcount > 1)
+		k->tgt->refcount--;
+	      else
+		gomp_unmap_tgt (k->tgt);
+	    }
+
+	  break;
+	default:
+	  gomp_mutex_unlock (&devicep->lock);	/* Release the lock before aborting.  */
+	  gomp_fatal ("GOMP_target_enter_exit_data unhandled kind 0x%.2x",
+		      kind);
+	}
+    }
+
   gomp_mutex_unlock (&devicep->lock);
+}
 
-  gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, false);
+void
+GOMP_target_enter_exit_data (int device, size_t mapnum, void **hostaddrs,
+			     size_t *sizes, unsigned short *kinds,
+			     unsigned int flags, void **depend)
+{
+  struct gomp_device_descr *devicep = resolve_device (device);
+
+  /* With depend clauses and nowait (inside a team, parent task not
+     final), hand the operation off to a target task and return.  With
+     depend clauses but no nowait, block the parent task until the
+     dependencies are resolved and then continue below as if this
+     were a merged task.  Until tasks can be scheduled during variable
+     mapping or unmapping, nowait without depend clauses is ignored.  */
+  if (depend != NULL)
+    {
+      struct gomp_thread *thr = gomp_thread ();
+      if (thr->task && thr->task->depend_hash)
+	{
+	  if ((flags & GOMP_TARGET_FLAG_NOWAIT)
+	      && thr->ts.team
+	      && !thr->task->final_task)
+	    {
+	      gomp_create_target_task (devicep, (void (*) (void *)) NULL,
+				       mapnum, hostaddrs, sizes, kinds,
+				       flags, depend);
+	      return;
+	    }
+
+	  struct gomp_team *team = thr->ts.team;
+	  /* If parallel or taskgroup has been cancelled, don't start new
+	     tasks.  */
+	  if (team
+	      && (gomp_team_barrier_cancelled (&team->barrier)
+		  || (thr->task->taskgroup
+		      && thr->task->taskgroup->cancelled)))
+	    return;
+
+	  gomp_task_maybe_wait_for_dependencies (depend);
+	}
+    }
+
+  if (devicep == NULL
+      || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return;	/* No device with the OpenMP 4.0 offload capability.  */
+
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team
+      && (gomp_team_barrier_cancelled (&team->barrier)
+	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
+    return;
+
+  size_t i;
+  if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)	/* Enter data.  */
+    for (i = 0; i < mapnum; i++)
+      if ((kinds[i] & 0xff) == GOMP_MAP_STRUCT)
+	{
+	  gomp_map_vars (devicep, sizes[i] + 1, &hostaddrs[i], NULL, &sizes[i],
+			 &kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+	  i += sizes[i];	/* Skip the entries mapped together with the struct.  */
+	}
+      else
+	gomp_map_vars (devicep, 1, &hostaddrs[i], NULL, &sizes[i], &kinds[i],
+		       true, GOMP_MAP_VARS_ENTER_DATA);
+  else	/* Exit data.  */
+    gomp_exit_data (devicep, mapnum, hostaddrs, sizes, kinds);
+}
+
+void
+gomp_target_task_fn (void *data)
+{	/* DATA is a struct gomp_target_task describing the work to perform.  */
+  struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  if (ttask->fn != NULL)
+    {
+      /* GOMP_target_41: not handled yet; control continues below.  */
+    }
+  else if (ttask->devicep == NULL
+	   || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return;
+
+  size_t i;
+  if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)	/* Target update.  */
+    gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+		 ttask->kinds, true);
+  else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)	/* Enter data.  */
+    for (i = 0; i < ttask->mapnum; i++)
+      if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
+	{
+	  gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
+			 &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+			 &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+	  i += ttask->sizes[i];	/* Skip the entries mapped with the struct.  */
+	}
+      else
+	gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
+		       &ttask->sizes[i], &ttask->kinds[i],
+		       true, GOMP_MAP_VARS_ENTER_DATA);
+  else	/* Exit data.  */
+    gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
+		    ttask->sizes, ttask->kinds);
 }
 
 void
@@ -1103,6 +1678,384 @@ GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
   (void) num_teams;
 }
 
+void *
+omp_target_alloc (size_t size, int device_num)
+{	/* Allocate SIZE bytes for device DEVICE_NUM; NULL if it cannot be resolved.  */
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return malloc (size);	/* Host fallback: plain host memory.  */
+
+  if (device_num < 0)
+    return NULL;
+
+  struct gomp_device_descr *devicep = resolve_device (device_num);
+  if (devicep == NULL)
+    return NULL;
+
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return malloc (size);	/* No OpenMP 4.0 offload capability: host memory.  */
+
+  gomp_mutex_lock (&devicep->lock);
+  void *ret = devicep->alloc_func (devicep->target_id, size);
+  gomp_mutex_unlock (&devicep->lock);
+  return ret;
+}
+
+void
+omp_target_free (void *device_ptr, int device_num)
+{	/* Release DEVICE_PTR with free or with device DEVICE_NUM's free_func.  */
+  if (device_ptr == NULL)
+    return;
+
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    {
+      free (device_ptr);	/* Host fallback: plain host memory.  */
+      return;
+    }
+
+  if (device_num < 0)
+    return;
+
+  struct gomp_device_descr *devicep = resolve_device (device_num);
+  if (devicep == NULL)
+    return;
+
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    {
+      free (device_ptr);	/* Mirrors the malloc path of omp_target_alloc.  */
+      return;
+    }
+
+  gomp_mutex_lock (&devicep->lock);
+  devicep->free_func (devicep->target_id, device_ptr);
+  gomp_mutex_unlock (&devicep->lock);
+}
+
+int
+omp_target_is_present (void *ptr, int device_num)
+{	/* Return 1 if PTR counts as present on device DEVICE_NUM, else 0.  */
+  if (ptr == NULL)
+    return 1;	/* A NULL pointer is reported as present.  */
+
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return 1;
+
+  if (device_num < 0)
+    return 0;
+
+  struct gomp_device_descr *devicep = resolve_device (device_num);
+  if (devicep == NULL)
+    return 0;
+
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return 1;	/* Same answer as for the host fallback.  */
+
+  gomp_mutex_lock (&devicep->lock);
+  struct splay_tree_s *mem_map = &devicep->mem_map;
+  struct splay_tree_key_s cur_node;
+
+  cur_node.host_start = (uintptr_t) ptr;
+  cur_node.host_end = cur_node.host_start;	/* Zero-length key at PTR.  */
+  splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
+  int ret = n != NULL;
+  gomp_mutex_unlock (&devicep->lock);
+  return ret;
+}
+
+int
+omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
+		   size_t src_offset, int dst_device_num, int src_device_num)
+{	/* Copy LENGTH bytes from SRC + SRC_OFFSET to DST + DST_OFFSET; 0 or EINVAL.  */
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+
+  if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    {
+      if (dst_device_num < 0)
+	return EINVAL;
+
+      dst_devicep = resolve_device (dst_device_num);
+      if (dst_devicep == NULL)
+	return EINVAL;
+
+      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	dst_devicep = NULL;	/* Handled below like host memory.  */
+    }
+  if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    {
+      if (src_device_num < 0)
+	return EINVAL;
+
+      src_devicep = resolve_device (src_device_num);
+      if (src_devicep == NULL)
+	return EINVAL;
+
+      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	src_devicep = NULL;	/* Handled below like host memory.  */
+    }
+  if (src_devicep == NULL && dst_devicep == NULL)	/* Host to host.  */
+    {
+      memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length);
+      return 0;
+    }
+  if (src_devicep == NULL)	/* Host to device.  */
+    {
+      gomp_mutex_lock (&dst_devicep->lock);
+      dst_devicep->host2dev_func (dst_devicep->target_id,
+				  (char *) dst + dst_offset,
+				  (char *) src + src_offset, length);
+      gomp_mutex_unlock (&dst_devicep->lock);
+      return 0;
+    }
+  if (dst_devicep == NULL)	/* Device to host.  */
+    {
+      gomp_mutex_lock (&src_devicep->lock);
+      src_devicep->dev2host_func (src_devicep->target_id,
+				  (char *) dst + dst_offset,
+				  (char *) src + src_offset, length);
+      gomp_mutex_unlock (&src_devicep->lock);
+      return 0;
+    }
+  if (src_devicep == dst_devicep)	/* Within a single device.  */
+    {
+      gomp_mutex_lock (&src_devicep->lock);
+      src_devicep->dev2dev_func (src_devicep->target_id,
+				 (char *) dst + dst_offset,
+				 (char *) src + src_offset, length);
+      gomp_mutex_unlock (&src_devicep->lock);
+      return 0;
+    }
+  return EINVAL;	/* Two different devices: not supported here.  */
+}
+
+static int
+omp_target_memcpy_rect_worker (void *dst, void *src, size_t element_size,
+			       int num_dims, const size_t *volume,
+			       const size_t *dst_offsets,
+			       const size_t *src_offsets,
+			       const size_t *dst_dimensions,
+			       const size_t *src_dimensions,
+			       struct gomp_device_descr *dst_devicep,
+			       struct gomp_device_descr *src_devicep)
+{	/* Recursive helper: copies dimension by dimension; returns 0 or EINVAL.  */
+  size_t dst_slice = element_size;
+  size_t src_slice = element_size;
+  size_t j, dst_off, src_off, length;
+  int i, ret;
+
+  if (num_dims == 1)	/* Base case: one copy of volume[0] elements.  */
+    {
+      if (__builtin_mul_overflow (element_size, volume[0], &length)
+	  || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off)
+	  || __builtin_mul_overflow (element_size, src_offsets[0], &src_off))
+	return EINVAL;	/* A byte count does not fit in size_t.  */
+      if (dst_devicep == NULL && src_devicep == NULL)
+	memcpy ((char *) dst + dst_off, (char *) src + src_off, length);
+      else if (src_devicep == NULL)
+	dst_devicep->host2dev_func (dst_devicep->target_id,
+				    (char *) dst + dst_off,
+				    (char *) src + src_off, length);
+      else if (dst_devicep == NULL)
+	src_devicep->dev2host_func (src_devicep->target_id,
+				    (char *) dst + dst_off,
+				    (char *) src + src_off, length);
+      else if (src_devicep == dst_devicep)
+	src_devicep->dev2dev_func (src_devicep->target_id,
+				   (char *) dst + dst_off,
+				   (char *) src + src_off, length);
+      else
+	return EINVAL;
+      return 0;
+    }
+
+  /* FIXME: it would be nice to have some plugin function to handle
+     num_dims == 2 and num_dims == 3 more efficiently.  Larger ones can
+     be handled in the generic recursion below, and for host-host it
+     should be used even for any num_dims >= 2.  */
+
+  for (i = 1; i < num_dims; i++)	/* Slice = element_size * inner dimensions.  */
+    if (__builtin_mul_overflow (dst_slice, dst_dimensions[i], &dst_slice)
+	|| __builtin_mul_overflow (src_slice, src_dimensions[i], &src_slice))
+      return EINVAL;
+  if (__builtin_mul_overflow (dst_slice, dst_offsets[0], &dst_off)
+      || __builtin_mul_overflow (src_slice, src_offsets[0], &src_off))
+    return EINVAL;
+  for (j = 0; j < volume[0]; j++)	/* One recursive call per outermost index.  */
+    {
+      ret = omp_target_memcpy_rect_worker ((char *) dst + dst_off,
+					   (char *) src + src_off,
+					   element_size, num_dims - 1,
+					   volume + 1, dst_offsets + 1,
+					   src_offsets + 1, dst_dimensions + 1,
+					   src_dimensions + 1, dst_devicep,
+					   src_devicep);
+      if (ret)
+	return ret;
+      dst_off += dst_slice;	/* Advance both sides by one slice.  */
+      src_off += src_slice;
+    }
+  return 0;
+}
+
+int
+omp_target_memcpy_rect (void *dst, void *src, size_t element_size,
+			int num_dims, const size_t *volume,
+			const size_t *dst_offsets,
+			const size_t *src_offsets,
+			const size_t *dst_dimensions,
+			const size_t *src_dimensions,
+			int dst_device_num, int src_device_num)
+{	/* Resolve both devices, then run the worker under a device lock.  */
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+
+  if (!dst && !src)	/* NOTE(review): presumably the OpenMP "query" form -- confirm.  */
+    return INT_MAX;
+
+  if (dst_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    {
+      if (dst_device_num < 0)
+	return EINVAL;
+
+      dst_devicep = resolve_device (dst_device_num);
+      if (dst_devicep == NULL)
+	return EINVAL;
+
+      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	dst_devicep = NULL;	/* The worker treats NULL as host memory.  */
+    }
+  if (src_device_num != GOMP_DEVICE_HOST_FALLBACK)
+    {
+      if (src_device_num < 0)
+	return EINVAL;
+
+      src_devicep = resolve_device (src_device_num);
+      if (src_devicep == NULL)
+	return EINVAL;
+
+      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+	src_devicep = NULL;	/* The worker treats NULL as host memory.  */
+    }
+
+  if (src_devicep != NULL && dst_devicep != NULL && src_devicep != dst_devicep)
+    return EINVAL;	/* Two different devices: not supported here.  */
+
+  if (src_devicep)	/* One lock suffices: two devices here are the same one.  */
+    gomp_mutex_lock (&src_devicep->lock);
+  else if (dst_devicep)
+    gomp_mutex_lock (&dst_devicep->lock);
+  int ret = omp_target_memcpy_rect_worker (dst, src, element_size, num_dims,
+					   volume, dst_offsets, src_offsets,
+					   dst_dimensions, src_dimensions,
+					   dst_devicep, src_devicep);
+  if (src_devicep)
+    gomp_mutex_unlock (&src_devicep->lock);
+  else if (dst_devicep)
+    gomp_mutex_unlock (&dst_devicep->lock);
+  return ret;
+}
+
+int
+omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
+			  size_t device_offset, int device_num)
+{	/* Record HOST_PTR -> DEVICE_PTR + DEVICE_OFFSET in mem_map; 0 or EINVAL.  */
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return EINVAL;
+
+  if (device_num < 0)
+    return EINVAL;
+
+  struct gomp_device_descr *devicep = resolve_device (device_num);
+  if (devicep == NULL)
+    return EINVAL;
+
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return EINVAL;
+
+  gomp_mutex_lock (&devicep->lock);
+
+  struct splay_tree_s *mem_map = &devicep->mem_map;
+  struct splay_tree_key_s cur_node;
+  int ret = EINVAL;
+
+  cur_node.host_start = (uintptr_t) host_ptr;
+  cur_node.host_end = cur_node.host_start + size;
+  splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
+  if (n)	/* The lookup found an existing entry.  */
+    {
+      if (n->tgt->tgt_start + n->tgt_offset
+	  == (uintptr_t) device_ptr + device_offset
+	  && n->host_start <= cur_node.host_start
+	  && n->host_end >= cur_node.host_end)
+	ret = 0;	/* Same device address, host range covered: success.  */
+    }
+  else	/* No entry yet: build a descriptor holding a single key.  */
+    {
+      struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt));
+      tgt->array = gomp_malloc (sizeof (*tgt->array));
+      tgt->refcount = 1;
+      tgt->tgt_start = 0;
+      tgt->tgt_end = 0;
+      tgt->to_free = NULL;
+      tgt->prev = NULL;
+      tgt->list_count = 0;
+      tgt->device_descr = devicep;
+      splay_tree_node array = tgt->array;
+      splay_tree_key k = &array->key;
+      k->host_start = cur_node.host_start;
+      k->host_end = cur_node.host_end;
+      k->tgt = tgt;
+      k->tgt_offset = (uintptr_t) device_ptr + device_offset;	/* tgt_start is 0.  */
+      k->refcount = REFCOUNT_INFINITY;
+      k->async_refcount = 0;
+      array->left = NULL;
+      array->right = NULL;
+      splay_tree_insert (&devicep->mem_map, array);
+      ret = 0;
+    }
+  gomp_mutex_unlock (&devicep->lock);
+  return ret;
+}
+
+int
+omp_target_disassociate_ptr (void *ptr, int device_num)
+{	/* Remove the mem_map entry that starts at PTR; returns 0 or EINVAL.  */
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return EINVAL;
+
+  if (device_num < 0)
+    return EINVAL;
+
+  struct gomp_device_descr *devicep = resolve_device (device_num);
+  if (devicep == NULL)
+    return EINVAL;
+
+  if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+    return EINVAL;
+
+  gomp_mutex_lock (&devicep->lock);
+
+  struct splay_tree_s *mem_map = &devicep->mem_map;
+  struct splay_tree_key_s cur_node;
+  int ret = EINVAL;
+
+  cur_node.host_start = (uintptr_t) ptr;
+  cur_node.host_end = cur_node.host_start;	/* Zero-length key at PTR.  */
+  splay_tree_key n = gomp_map_lookup (mem_map, &cur_node);
+  if (n	/* Accept only entries shaped like omp_target_associate_ptr's.  */
+      && n->host_start == cur_node.host_start
+      && n->refcount == REFCOUNT_INFINITY
+      && n->tgt->tgt_start == 0
+      && n->tgt->to_free == NULL
+      && n->tgt->refcount == 1
+      && n->tgt->list_count == 0)
+    {
+      splay_tree_remove (&devicep->mem_map, n);
+      gomp_unmap_tgt (n->tgt);
+      ret = 0;
+    }
+
+  gomp_mutex_unlock (&devicep->lock);
+  return ret;
+}
+
 #ifdef PLUGIN_SUPPORT
 
 /* This function tries to load a plugin for DEVICE.  Name of plugin is passed
@@ -1153,7 +2106,10 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM (host2dev);
   device->capabilities = device->get_caps_func ();
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-    DLSYM (run);
+    {
+      DLSYM (run);
+      DLSYM (dev2dev);
+    }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
     {
       if (!DLSYM_OPT (openacc.exec, openacc_parallel)
diff --git a/libgomp/task.c b/libgomp/task.c
index 74920d5ddb8..1246c6ae318 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -29,6 +29,7 @@
 #include "libgomp.h"
 #include <stdlib.h>
 #include <string.h>
+#include "gomp-constants.h"
 
 typedef struct gomp_task_depend_entry *hash_entry_type;
 
@@ -91,6 +92,8 @@ gomp_end_task (void)
   thr->task = task->parent;
 }
 
+/* Orphan the task in CHILDREN and all its siblings.  */
+
 static inline void
 gomp_clear_parent (struct gomp_task *children)
 {
@@ -105,16 +108,136 @@ gomp_clear_parent (struct gomp_task *children)
     while (task != children);
 }
 
-static void gomp_task_maybe_wait_for_dependencies (void **depend);
+/* Helper function for GOMP_task and gomp_create_target_task.  Depend clause
+   handling for deferred task creation; fills TASK->depend from DEPEND.  */
+
+static void
+gomp_task_handle_depend (struct gomp_task *task, struct gomp_task *parent,
+			 void **depend)
+{
+  size_t ndepend = (uintptr_t) depend[0];
+  size_t nout = (uintptr_t) depend[1];
+  size_t i;
+  hash_entry_type ent;
+
+  task->depend_count = ndepend;
+  task->num_dependees = 0;
+  if (parent->depend_hash == NULL)
+    parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+  for (i = 0; i < ndepend; i++)
+    {
+      task->depend[i].addr = depend[2 + i];
+      task->depend[i].next = NULL;
+      task->depend[i].prev = NULL;
+      task->depend[i].task = task;
+      task->depend[i].is_in = i >= nout;
+      task->depend[i].redundant = false;
+      task->depend[i].redundant_out = false;
+
+      hash_entry_type *slot = htab_find_slot (&parent->depend_hash,
+					      &task->depend[i], INSERT);
+      hash_entry_type out = NULL, last = NULL;
+      if (*slot)
+	{
+	  /* If multiple depends on the same task are the same, all but the
+	     first one are redundant.  As inout/out come first, if any of them
+	     is inout/out, it will win, which is the right semantics.  */
+	  if ((*slot)->task == task)
+	    {
+	      task->depend[i].redundant = true;
+	      continue;
+	    }
+	  for (ent = *slot; ent; ent = ent->next)
+	    {
+	      if (ent->redundant_out)
+		break;
+
+	      last = ent;
+
+	      /* depend(in:...) doesn't depend on earlier depend(in:...).  */
+	      if (i >= nout && ent->is_in)
+		continue;
+
+	      if (!ent->is_in)
+		out = ent;
+
+	      struct gomp_task *tsk = ent->task;
+	      if (tsk->dependers == NULL)
+		{
+		  tsk->dependers
+		    = gomp_malloc (sizeof (struct gomp_dependers_vec)
+				   + 6 * sizeof (struct gomp_task *));
+		  tsk->dependers->n_elem = 1;
+		  tsk->dependers->allocated = 6;
+		  tsk->dependers->elem[0] = task;
+		  task->num_dependees++;
+		  continue;
+		}
+	      /* We already have some other dependency on tsk from earlier
+		 depend clause.  */
+	      else if (tsk->dependers->n_elem
+		       && (tsk->dependers->elem[tsk->dependers->n_elem - 1]
+			   == task))
+		continue;
+	      else if (tsk->dependers->n_elem == tsk->dependers->allocated)
+		{
+		  tsk->dependers->allocated
+		    = tsk->dependers->allocated * 2 + 2;
+		  tsk->dependers
+		    = gomp_realloc (tsk->dependers,
+				    sizeof (struct gomp_dependers_vec)
+				    + (tsk->dependers->allocated
+				       * sizeof (struct gomp_task *)));
+		}
+	      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
+	      task->num_dependees++;
+	    }
+	  task->depend[i].next = *slot;
+	  (*slot)->prev = &task->depend[i];
+	}
+      *slot = &task->depend[i];
+
+      /* There is no need to store more than one depend({,in}out:) task per
+	 address in the hash table chain for the purpose of creation of
+	 deferred tasks, because each out depends on all earlier outs, thus it
+	 is enough to record just the last depend({,in}out:).  For depend(in:),
+	 we need to keep all of the previous ones not terminated yet, because
+	 a later depend({,in}out:) might need to depend on all of them.  So, if
+	 the new task's clause is depend({,in}out:), we know there is at most
+	 one other depend({,in}out:) clause in the list (out).  For
+	 non-deferred tasks we want to see all outs, so they are moved to the
+	 end of the chain, after first redundant_out entry all following
+	 entries should be redundant_out.  */
+      if (!task->depend[i].is_in && out)
+	{
+	  if (out != last)
+	    {
+	      out->next->prev = out->prev;
+	      out->prev->next = out->next;
+	      out->next = last->next;
+	      out->prev = last;
+	      last->next = out;
+	      if (out->next)
+		out->next->prev = out;
+	    }
+	  out->redundant_out = true;
+	}
+    }
+}
 
 /* Called when encountering an explicit task directive.  If IF_CLAUSE is
    false, then we must not delay in executing the task.  If UNTIED is true,
-   then the task may be executed by any member of the team.  */
+   then the task may be executed by any member of the team.
+
+   DEPEND is an array containing:
+	depend[0]: number of depend elements.
+	depend[1]: number of depend elements of type "out".
+	depend[2..N+1]: address of [1..N]th depend element.  */
 
 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
 	   long arg_size, long arg_align, bool if_clause, unsigned flags,
-	   void **depend)
+	   void **depend, int priority)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -126,8 +249,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
      might be running on different thread than FN.  */
   if (cpyfn)
     if_clause = false;
-  if (flags & 1)
-    flags &= ~1;
+  flags &= ~GOMP_TASK_FLAG_UNTIED;
 #endif
 
   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
@@ -136,6 +258,11 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
 	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
     return;
 
+  if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0)
+    priority = 0;
+  /* FIXME, use priority.  */
+  (void) priority;
+
   if (!if_clause || team == NULL
       || (thr->task && thr->task->final_task)
       || team->task_count > 64 * team->nthreads)
@@ -148,12 +275,14 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
 	 depend clauses for non-deferred tasks other than this, because
 	 the parent task is suspended until the child task finishes and thus
 	 it can't start further child tasks.  */
-      if ((flags & 8) && thr->task && thr->task->depend_hash)
+      if ((flags & GOMP_TASK_FLAG_DEPEND)
+	  && thr->task && thr->task->depend_hash)
 	gomp_task_maybe_wait_for_dependencies (depend);
 
       gomp_init_task (&task, thr->task, gomp_icv (false));
-      task.kind = GOMP_TASK_IFFALSE;
-      task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
+      task.kind = GOMP_TASK_UNDEFERRED;
+      task.final_task = (thr->task && thr->task->final_task)
+			|| (flags & GOMP_TASK_FLAG_FINAL);
       if (thr->task)
 	{
 	  task.in_tied_task = thr->task->in_tied_task;
@@ -196,7 +325,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       bool do_wake;
       size_t depend_size = 0;
 
-      if (flags & 8)
+      if (flags & GOMP_TASK_FLAG_DEPEND)
 	depend_size = ((uintptr_t) depend[0]
 		       * sizeof (struct gomp_task_depend_entry));
       task = gomp_malloc (sizeof (*task) + depend_size
@@ -204,7 +333,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
 		      & ~(uintptr_t) (arg_align - 1));
       gomp_init_task (task, parent, gomp_icv (false));
-      task->kind = GOMP_TASK_IFFALSE;
+      task->kind = GOMP_TASK_UNDEFERRED;
       task->in_tied_task = parent->in_tied_task;
       task->taskgroup = taskgroup;
       thr->task = task;
@@ -219,7 +348,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       task->kind = GOMP_TASK_WAITING;
       task->fn = fn;
       task->fn_data = arg;
-      task->final_task = (flags & 2) >> 1;
+      task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
       gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
 	 tasks.  */
@@ -236,123 +365,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
 	taskgroup->num_children++;
       if (depend_size)
 	{
-	  size_t ndepend = (uintptr_t) depend[0];
-	  size_t nout = (uintptr_t) depend[1];
-	  size_t i;
-	  hash_entry_type ent;
-
-	  task->depend_count = ndepend;
-	  task->num_dependees = 0;
-	  if (parent->depend_hash == NULL)
-	    parent->depend_hash
-	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
-	  for (i = 0; i < ndepend; i++)
-	    {
-	      task->depend[i].addr = depend[2 + i];
-	      task->depend[i].next = NULL;
-	      task->depend[i].prev = NULL;
-	      task->depend[i].task = task;
-	      task->depend[i].is_in = i >= nout;
-	      task->depend[i].redundant = false;
-	      task->depend[i].redundant_out = false;
-
-	      hash_entry_type *slot
-		= htab_find_slot (&parent->depend_hash, &task->depend[i],
-				  INSERT);
-	      hash_entry_type out = NULL, last = NULL;
-	      if (*slot)
-		{
-		  /* If multiple depends on the same task are the
-		     same, all but the first one are redundant.
-		     As inout/out come first, if any of them is
-		     inout/out, it will win, which is the right
-		     semantics.  */
-		  if ((*slot)->task == task)
-		    {
-		      task->depend[i].redundant = true;
-		      continue;
-		    }
-		  for (ent = *slot; ent; ent = ent->next)
-		    {
-		      if (ent->redundant_out)
-			break;
-
-		      last = ent;
-
-		      /* depend(in:...) doesn't depend on earlier
-			 depend(in:...).  */
-		      if (i >= nout && ent->is_in)
-			continue;
-
-		      if (!ent->is_in)
-			out = ent;
-
-		      struct gomp_task *tsk = ent->task;
-		      if (tsk->dependers == NULL)
-			{
-			  tsk->dependers
-			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
-					   + 6 * sizeof (struct gomp_task *));
-			  tsk->dependers->n_elem = 1;
-			  tsk->dependers->allocated = 6;
-			  tsk->dependers->elem[0] = task;
-			  task->num_dependees++;
-			  continue;
-			}
-		      /* We already have some other dependency on tsk
-			 from earlier depend clause.  */
-		      else if (tsk->dependers->n_elem
-			       && (tsk->dependers->elem[tsk->dependers->n_elem
-							- 1]
-				   == task))
-			continue;
-		      else if (tsk->dependers->n_elem
-			       == tsk->dependers->allocated)
-			{
-			  tsk->dependers->allocated
-			    = tsk->dependers->allocated * 2 + 2;
-			  tsk->dependers
-			    = gomp_realloc (tsk->dependers,
-					    sizeof (struct gomp_dependers_vec)
-					    + (tsk->dependers->allocated
-					       * sizeof (struct gomp_task *)));
-			}
-		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
-		      task->num_dependees++;
-		    }
-		  task->depend[i].next = *slot;
-		  (*slot)->prev = &task->depend[i];
-		}
-	      *slot = &task->depend[i];
-
-	      /* There is no need to store more than one depend({,in}out:)
-		 task per address in the hash table chain for the purpose
-		 of creation of deferred tasks, because each out
-		 depends on all earlier outs, thus it is enough to record
-		 just the last depend({,in}out:).  For depend(in:), we need
-		 to keep all of the previous ones not terminated yet, because
-		 a later depend({,in}out:) might need to depend on all of
-		 them.  So, if the new task's clause is depend({,in}out:),
-		 we know there is at most one other depend({,in}out:) clause
-		 in the list (out).  For non-deferred tasks we want to see
-		 all outs, so they are moved to the end of the chain,
-		 after first redundant_out entry all following entries
-		 should be redundant_out.  */
-	      if (!task->depend[i].is_in && out)
-		{
-		  if (out != last)
-		    {
-		      out->next->prev = out->prev;
-		      out->prev->next = out->next;
-		      out->next = last->next;
-		      out->prev = last;
-		      last->next = out;
-		      if (out->next)
-			out->next->prev = out;
-		    }
-		  out->redundant_out = true;
-		}
-	    }
+	  gomp_task_handle_depend (task, parent, depend);
 	  if (task->num_dependees)
 	    {
 	      gomp_mutex_unlock (&team->task_lock);
@@ -374,6 +387,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
       parent->children = task;
       if (taskgroup)
 	{
+	  /* If applicable, place task into its taskgroup.  */
 	  if (taskgroup->children)
 	    {
 	      task->next_taskgroup = taskgroup->children;
@@ -412,26 +426,340 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
     }
 }
 
+ialias (GOMP_taskgroup_start)
+ialias (GOMP_taskgroup_end)
+
+#define TYPE long
+#define UTYPE unsigned long
+#define TYPE_is_long 1
+#include "taskloop.c"
+#undef TYPE
+#undef UTYPE
+#undef TYPE_is_long
+
+#define TYPE unsigned long long
+#define UTYPE TYPE
+#define GOMP_taskloop GOMP_taskloop_ull
+#include "taskloop.c"
+#undef TYPE
+#undef UTYPE
+#undef GOMP_taskloop
+
+/* Called for nowait target tasks.  */
+
+void
+gomp_create_target_task (struct gomp_device_descr *devicep,
+			 void (*fn) (void *), size_t mapnum, void **hostaddrs,
+			 size_t *sizes, unsigned short *kinds,
+			 unsigned int flags, void **depend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team
+      && (gomp_team_barrier_cancelled (&team->barrier)
+	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
+    return;
+
+  struct gomp_target_task *ttask;
+  struct gomp_task *task;
+  struct gomp_task *parent = thr->task;
+  struct gomp_taskgroup *taskgroup = parent->taskgroup;
+  bool do_wake;
+  /* DEPEND may be NULL and must not be indexed then; otherwise depend[0]
+     is used as the number of gomp_task_depend_entry slots to reserve.  */
+  uintptr_t depend_cnt = depend != NULL ? (uintptr_t) depend[0] : 0;
+  size_t depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
+
+  task = gomp_malloc (sizeof (*task) + depend_size
+		      + sizeof (*ttask)
+		      + mapnum * (sizeof (void *) + sizeof (size_t)
+				  + sizeof (unsigned short)));
+  gomp_init_task (task, parent, gomp_icv (false));
+  task->kind = GOMP_TASK_WAITING;
+  task->in_tied_task = parent->in_tied_task;
+  task->taskgroup = taskgroup;
+  ttask = (struct gomp_target_task *) &task->depend[depend_cnt];
+  ttask->devicep = devicep;
+  ttask->fn = fn;
+  ttask->mapnum = mapnum;
+  memcpy (ttask->hostaddrs, hostaddrs, mapnum * sizeof (void *));
+  ttask->sizes = (size_t *) &ttask->hostaddrs[mapnum];
+  memcpy (ttask->sizes, sizes, mapnum * sizeof (size_t));
+  ttask->kinds = (unsigned short *) &ttask->sizes[mapnum];
+  memcpy (ttask->kinds, kinds, mapnum * sizeof (unsigned short));
+  ttask->flags = flags;
+  task->fn = gomp_target_task_fn;
+  task->fn_data = ttask;
+  task->final_task = 0;
+  gomp_mutex_lock (&team->task_lock);
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier)
+			|| (taskgroup && taskgroup->cancelled), 0))
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      gomp_finish_task (task);
+      free (task);
+      return;
+    }
+  if (taskgroup)
+    taskgroup->num_children++;
+  if (depend_size)
+    {
+      gomp_task_handle_depend (task, parent, depend);
+      if (task->num_dependees)
+	{
+	  gomp_mutex_unlock (&team->task_lock);
+	  return;
+	}
+    }
+  if (parent->children)
+    {
+      task->next_child = parent->children;
+      task->prev_child = parent->children->prev_child;
+      task->next_child->prev_child = task;
+      task->prev_child->next_child = task;
+    }
+  else
+    {
+      task->next_child = task;
+      task->prev_child = task;
+    }
+  parent->children = task;
+  if (taskgroup)
+    {
+      /* If applicable, place task into its taskgroup.  */
+      if (taskgroup->children)
+	{
+	  task->next_taskgroup = taskgroup->children;
+	  task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	  task->next_taskgroup->prev_taskgroup = task;
+	  task->prev_taskgroup->next_taskgroup = task;
+	}
+      else
+	{
+	  task->next_taskgroup = task;
+	  task->prev_taskgroup = task;
+	}
+      taskgroup->children = task;
+    }
+  if (team->task_queue)
+    {
+      task->next_queue = team->task_queue;
+      task->prev_queue = team->task_queue->prev_queue;
+      task->next_queue->prev_queue = task;
+      task->prev_queue->next_queue = task;
+    }
+  else
+    {
+      task->next_queue = task;
+      task->prev_queue = task;
+      team->task_queue = task;
+    }
+  ++team->task_count;
+  ++team->task_queued_count;
+  gomp_team_barrier_set_task_pending (&team->barrier);
+  do_wake = team->task_running_count + !parent->in_tied_task
+	    < team->nthreads;
+  gomp_mutex_unlock (&team->task_lock);
+  if (do_wake)
+    gomp_team_barrier_wake (&team->barrier, 1);
+}
+
+#if _LIBGOMP_CHECKING
+/* Sanity check TASK to make sure it is in its parent's children
+   queue, and that the tasks therein are in the right order.
+
+   The expected order is:
+	parent_depends_on WAITING tasks
+	!parent_depends_on WAITING tasks
+	TIED tasks
+
+   PARENT is the alleged parent of TASK.  */
+
+static void
+verify_children_queue (struct gomp_task *task, struct gomp_task *parent)
+{
+  if (task->parent != parent)
+    gomp_fatal ("verify_children_queue: incompatible parents");
+  /* It's OK, Annie was an orphan and she turned out all right.  */
+  if (!parent)
+    return;
+  /* Walk the circular list once; an empty list falls through to !found.  */
+  bool seen_tied = false;
+  bool seen_plain_waiting = false;
+  bool found = false;
+  struct gomp_task *t = parent->children;
+  if (t)
+    do
+      {
+	if (t == task)
+	  found = true;
+	if (seen_tied && t->kind == GOMP_TASK_WAITING)
+	  gomp_fatal ("verify_children_queue: WAITING task after TIED");
+	if (t->kind == GOMP_TASK_TIED)
+	  seen_tied = true;
+	else if (t->kind == GOMP_TASK_WAITING)
+	  {
+	    if (t->parent_depends_on)
+	      {
+		if (seen_plain_waiting)
+		  gomp_fatal ("verify_children_queue: parent_depends_on after "
+			      "!parent_depends_on");
+	      }
+	    else
+	      seen_plain_waiting = true;
+	  }
+	t = t->next_child;
+      }
+    while (t != parent->children);
+  if (!found)
+    gomp_fatal ("verify_children_queue: child not found in parent queue");
+}
+
+/* Sanity check TASK to make sure it is in its taskgroup queue (if
+   applicable), and that the tasks therein are in the right order.
+
+   The expected order is that GOMP_TASK_WAITING tasks must come before
+   GOMP_TASK_TIED tasks.
+
+   TASK is the task.  */
+
+static void
+verify_taskgroup_queue (struct gomp_task *task)
+{
+  struct gomp_taskgroup *taskgroup = task->taskgroup;
+  if (!taskgroup)
+    return;
+  /* Walk the circular list once; an empty list falls through to !found.  */
+  bool seen_tied = false;
+  bool found = false;
+  struct gomp_task *t = taskgroup->children;
+  if (t)
+    do
+      {
+	if (t == task)
+	  found = true;
+	if (t->kind == GOMP_TASK_WAITING && seen_tied)
+	  gomp_fatal ("verify_taskgroup_queue: WAITING task after TIED");
+	if (t->kind == GOMP_TASK_TIED)
+	  seen_tied = true;
+	t = t->next_taskgroup;
+      }
+    while (t != taskgroup->children);
+  if (!found)
+    gomp_fatal ("verify_taskgroup_queue: child not found in taskgroup queue");
+}
+
+/* Verify that TASK is in the team's task queue.  */
+
+static void
+verify_task_queue (struct gomp_task *task, struct gomp_team *team)
+{
+  /* Only read TEAM's queue head after checking that TEAM is non-NULL.  */
+  struct gomp_task *t = team ? team->task_queue : NULL;
+  if (t)
+    do
+      {
+	if (t == task)
+	  return;
+	t = t->next_queue;
+      }
+    while (t != team->task_queue);
+  gomp_fatal ("verify_task_queue: child not in team");
+}
+#endif
+
 static inline bool
 gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
-		   struct gomp_taskgroup *taskgroup, struct gomp_team *team)
+		   struct gomp_team *team)
 {
+#if _LIBGOMP_CHECKING
+  verify_children_queue (child_task, parent);
+  verify_taskgroup_queue (child_task);
+  verify_task_queue (child_task, team);
+#endif
+
   if (parent)
     {
+      /* Adjust children such that it will point to a next child,
+	 while the current one is scheduled to be executed.  This way,
+	 GOMP_taskwait (and others) can schedule a next task while
+	 waiting.
+
+	 Do not remove it entirely from the circular list, as it is
+	 still a child, though not one we should consider first (say
+	 by GOMP_taskwait).  */
       if (parent->children == child_task)
 	parent->children = child_task->next_child;
+      /* TIED tasks cannot come before WAITING tasks.  If we're about
+	 to make this task TIED, rewire things appropriately.
+	 However, a TIED task at the end is perfectly fine.  */
+      else if (child_task->next_child->kind == GOMP_TASK_WAITING
+	       && child_task->next_child != parent->children)
+	{
+	  /* Remove from the list.  */
+	  child_task->prev_child->next_child = child_task->next_child;
+	  child_task->next_child->prev_child = child_task->prev_child;
+	  /* Rewire at the end of its siblings.  */
+	  child_task->next_child = parent->children;
+	  child_task->prev_child = parent->children->prev_child;
+	  parent->children->prev_child->next_child = child_task;
+	  parent->children->prev_child = child_task;
+	}
+
+      /* If the current task (child_task) is at the top of the
+	 parent's last_parent_depends_on, it's about to be removed
+	 from it.  Adjust last_parent_depends_on appropriately.  */
       if (__builtin_expect (child_task->parent_depends_on, 0)
 	  && parent->taskwait->last_parent_depends_on == child_task)
 	{
+	  /* The last_parent_depends_on list was built with all
+	     parent_depends_on entries linked to the prev_child.  Grab
+	     the next last_parent_depends_on head from this prev_child if
+	     available...  */
 	  if (child_task->prev_child->kind == GOMP_TASK_WAITING
 	      && child_task->prev_child->parent_depends_on)
 	    parent->taskwait->last_parent_depends_on = child_task->prev_child;
 	  else
-	    parent->taskwait->last_parent_depends_on = NULL;
+	    {
+	      /* ...otherwise, there are no more parent_depends_on
+		 entries waiting to run.  In which case, clear the
+		 list.  */
+	      parent->taskwait->last_parent_depends_on = NULL;
+	    }
 	}
     }
-  if (taskgroup && taskgroup->children == child_task)
-    taskgroup->children = child_task->next_taskgroup;
+
+  /* Adjust taskgroup to point to the next taskgroup.  See note above
+     regarding adjustment of children as to why the child_task is not
+     removed entirely from the circular list.  */
+  struct gomp_taskgroup *taskgroup = child_task->taskgroup;
+  if (taskgroup)
+    {
+      if (taskgroup->children == child_task)
+	taskgroup->children = child_task->next_taskgroup;
+      /* TIED tasks cannot come before WAITING tasks.  If we're about
+	 to make this task TIED, rewire things appropriately.
+	 However, a TIED task at the end is perfectly fine.  */
+      else if (child_task->next_taskgroup->kind == GOMP_TASK_WAITING
+	       && child_task->next_taskgroup != taskgroup->children)
+	{
+	  /* Remove from the list.  */
+	  child_task->prev_taskgroup->next_taskgroup
+	    = child_task->next_taskgroup;
+	  child_task->next_taskgroup->prev_taskgroup
+	    = child_task->prev_taskgroup;
+	  /* Rewire at the end of its taskgroup.  */
+	  child_task->next_taskgroup = taskgroup->children;
+	  child_task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	  taskgroup->children->prev_taskgroup->next_taskgroup = child_task;
+	  taskgroup->children->prev_taskgroup = child_task;
+	}
+    }
+
+  /* Remove child_task from the task_queue.  */
   child_task->prev_queue->next_queue = child_task->next_queue;
   child_task->next_queue->prev_queue = child_task->prev_queue;
   if (team->task_queue == child_task)
@@ -442,6 +770,7 @@ gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
 	team->task_queue = NULL;
     }
   child_task->kind = GOMP_TASK_TIED;
+
   if (--team->task_queued_count == 0)
     gomp_team_barrier_clear_task_pending (&team->barrier);
   if ((gomp_team_barrier_cancelled (&team->barrier)
@@ -479,6 +808,11 @@ gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
       }
 }
 
+/* After CHILD_TASK has been run, adjust the various task queues to
+   give higher priority to the tasks that depend on CHILD_TASK.
+
+   TEAM is the team to which CHILD_TASK belongs.  */
+
 static size_t
 gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 				     struct gomp_team *team)
@@ -502,6 +836,7 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 	      if (parent->taskwait && parent->taskwait->last_parent_depends_on
 		  && !task->parent_depends_on)
 		{
+		  /* Put depender in last_parent_depends_on.  */
 		  struct gomp_task *last_parent_depends_on
 		    = parent->taskwait->last_parent_depends_on;
 		  task->next_child = last_parent_depends_on->next_child;
@@ -509,6 +844,8 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 		}
 	      else
 		{
+		  /* Make depender a sibling of child_task, and place
+		     it at the top of said sibling list.  */
 		  task->next_child = parent->children;
 		  task->prev_child = parent->children->prev_child;
 		  parent->children = task;
@@ -518,6 +855,7 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 	    }
 	  else
 	    {
+	      /* Make depender a sibling of child_task.  */
 	      task->next_child = task;
 	      task->prev_child = task;
 	      parent->children = task;
@@ -539,6 +877,8 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 		parent->taskwait->last_parent_depends_on = task;
 	    }
 	}
+      /* If depender is in a taskgroup, put it at the TOP of its
+	 taskgroup.  */
       if (taskgroup)
 	{
 	  if (taskgroup->children)
@@ -560,6 +900,8 @@ gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
 	      gomp_sem_post (&taskgroup->taskgroup_sem);
 	    }
 	}
+      /* Put depender of child_task at the END of the team's
+	 task_queue.  */
       if (team->task_queue)
 	{
 	  task->next_queue = team->task_queue;
@@ -602,12 +944,18 @@ gomp_task_run_post_handle_depend (struct gomp_task *child_task,
   return gomp_task_run_post_handle_dependers (child_task, team);
 }
 
+/* Remove CHILD_TASK from its parent.  */
+
 static inline void
 gomp_task_run_post_remove_parent (struct gomp_task *child_task)
 {
   struct gomp_task *parent = child_task->parent;
   if (parent == NULL)
     return;
+
+  /* If this was the last task the parent was depending on,
+     synchronize with gomp_task_maybe_wait_for_dependencies so it can
+     clean up and return.  */
   if (__builtin_expect (child_task->parent_depends_on, 0)
       && --parent->taskwait->n_depend == 0
       && parent->taskwait->in_depend_wait)
@@ -615,6 +963,8 @@ gomp_task_run_post_remove_parent (struct gomp_task *child_task)
       parent->taskwait->in_depend_wait = false;
       gomp_sem_post (&parent->taskwait->taskwait_sem);
     }
+
+  /* Remove CHILD_TASK from its sibling list.  */
   child_task->prev_child->next_child = child_task->next_child;
   child_task->next_child->prev_child = child_task->prev_child;
   if (parent->children != child_task)
@@ -637,6 +987,8 @@ gomp_task_run_post_remove_parent (struct gomp_task *child_task)
     }
 }
 
+/* Remove CHILD_TASK from its taskgroup.  */
+
 static inline void
 gomp_task_run_post_remove_taskgroup (struct gomp_task *child_task)
 {
@@ -701,7 +1053,7 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state)
 	{
 	  child_task = team->task_queue;
 	  cancelled = gomp_task_run_pre (child_task, child_task->parent,
-					 child_task->taskgroup, team);
+					 team);
 	  if (__builtin_expect (cancelled, 0))
 	    {
 	      if (to_free)
@@ -766,7 +1118,9 @@ gomp_barrier_handle_tasks (gomp_barrier_state_t state)
     }
 }
 
-/* Called when encountering a taskwait directive.  */
+/* Called when encountering a taskwait directive.
+
+   Wait for all children of the current task.  */
 
 void
 GOMP_taskwait (void)
@@ -812,8 +1166,7 @@ GOMP_taskwait (void)
 	{
 	  child_task = task->children;
 	  cancelled
-	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
-				 team);
+	    = gomp_task_run_pre (child_task, task, team);
 	  if (__builtin_expect (cancelled, 0))
 	    {
 	      if (to_free)
@@ -863,6 +1216,9 @@ GOMP_taskwait (void)
 	 finish_cancelled:;
 	  size_t new_tasks
 	    = gomp_task_run_post_handle_depend (child_task, team);
+
+	  /* Remove child_task from children list, and set up the next
+	     sibling to be run.  */
 	  child_task->prev_child->next_child = child_task->next_child;
 	  child_task->next_child->prev_child = child_task->prev_child;
 	  if (task->children == child_task)
@@ -872,8 +1228,12 @@ GOMP_taskwait (void)
 	      else
 		task->children = NULL;
 	    }
+	  /* Orphan all the children of CHILD_TASK.  */
 	  gomp_clear_parent (child_task->children);
+
+	  /* Remove CHILD_TASK from its taskgroup.  */
 	  gomp_task_run_post_remove_taskgroup (child_task);
+
 	  to_free = child_task;
 	  child_task = NULL;
 	  team->task_count--;
@@ -889,9 +1249,11 @@ GOMP_taskwait (void)
 }
 
 /* This is like GOMP_taskwait, but we only wait for tasks that the
-   upcoming task depends on.  */
+   upcoming task depends on.
 
-static void
+   DEPEND is as in GOMP_task.  */
+
+void
 gomp_task_maybe_wait_for_dependencies (void **depend)
 {
   struct gomp_thread *thr = gomp_thread ();
@@ -923,11 +1285,33 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
 	      {
 		tsk->parent_depends_on = true;
 		++num_awaited;
+		/* If a task we need to wait for is not already
+		   running and is ready to be scheduled, move it to
+		   front, so that we run it as soon as possible.
+
+		   We rearrange the children queue such that all
+		   parent_depends_on tasks are first, and
+		   last_parent_depends_on points to the last such task
+		   we rearranged.  For example, given the following
+		   children where PD[123] are the parent_depends_on
+		   tasks:
+
+			task->children
+			|
+			V
+			C1 -> C2 -> C3 -> PD1 -> PD2 -> PD3 -> C4
+
+		   We rearrange such that:
+
+			task->children
+			|	       +--- last_parent_depends_on
+			|	       |
+			V	       V
+			PD1 -> PD2 -> PD3 -> C1 -> C2 -> C3 -> C4
+		*/
+
 		if (tsk->num_dependees == 0 && tsk->kind == GOMP_TASK_WAITING)
 		  {
-		    /* If a task we need to wait for is not already
-		       running and is ready to be scheduled, move it
-		       to front, so that we run it as soon as possible.  */
 		    if (last_parent_depends_on)
 		      {
 			tsk->prev_child->next_child = tsk->next_child;
@@ -941,8 +1325,8 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
 		      {
 			tsk->prev_child->next_child = tsk->next_child;
 			tsk->next_child->prev_child = tsk->prev_child;
-			tsk->prev_child = task->children;
-			tsk->next_child = task->children->next_child;
+			tsk->prev_child = task->children->prev_child;
+			tsk->next_child = task->children;
 			task->children = tsk;
 			tsk->prev_child->next_child = tsk;
 			tsk->next_child->prev_child = tsk;
@@ -983,8 +1367,7 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
 	{
 	  child_task = task->children;
 	  cancelled
-	    = gomp_task_run_pre (child_task, task, child_task->taskgroup,
-				 team);
+	    = gomp_task_run_pre (child_task, task, team);
 	  if (__builtin_expect (cancelled, 0))
 	    {
 	      if (to_free)
@@ -1028,6 +1411,8 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
 	    = gomp_task_run_post_handle_depend (child_task, team);
 	  if (child_task->parent_depends_on)
 	    --taskwait.n_depend;
+
+	  /* Remove child_task from sibling list.  */
 	  child_task->prev_child->next_child = child_task->next_child;
 	  child_task->next_child->prev_child = child_task->prev_child;
 	  if (task->children == child_task)
@@ -1037,6 +1422,7 @@ gomp_task_maybe_wait_for_dependencies (void **depend)
 	      else
 		task->children = NULL;
 	    }
+
 	  gomp_clear_parent (child_task->children);
 	  gomp_task_run_post_remove_taskgroup (child_task);
 	  to_free = child_task;
@@ -1070,7 +1456,7 @@ GOMP_taskgroup_start (void)
   struct gomp_taskgroup *taskgroup;
 
   /* If team is NULL, all tasks are executed as
-     GOMP_TASK_IFFALSE tasks and thus all children tasks of
+     GOMP_TASK_UNDEFERRED tasks and thus all children tasks of
      taskgroup and their descendant tasks will be finished
      by the time GOMP_taskgroup_end is called.  */
   if (team == NULL)
@@ -1137,8 +1523,7 @@ GOMP_taskgroup_end (void)
       if (child_task->kind == GOMP_TASK_WAITING)
 	{
 	  cancelled
-	    = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
-				 team);
+	    = gomp_task_run_pre (child_task, child_task->parent, team);
 	  if (__builtin_expect (cancelled, 0))
 	    {
 	      if (to_free)
diff --git a/libgomp/taskloop.c b/libgomp/taskloop.c
new file mode 100644
index 00000000000..f57a5a16ef2
--- /dev/null
+++ b/libgomp/taskloop.c
@@ -0,0 +1,363 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the taskloop construct.  It is included twice, once
+   for the long and once for unsigned long long variant.  */
+
+/* Called when encountering a taskloop directive.  Splits the iterations
+   from START to END (exclusive) by STEP into tasks running FN; NUM_TASKS is
+   the grainsize instead if GOMP_TASK_FLAG_GRAINSIZE is set in FLAGS.  */
+
+void
+GOMP_taskloop (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
+	       long arg_size, long arg_align, unsigned flags,
+	       unsigned long num_tasks, int priority,
+	       TYPE start, TYPE end, TYPE step)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+#ifdef HAVE_BROKEN_POSIX_SEMAPHORES
+  /* If pthread_mutex_* is used for omp_*lock*, then each task must be
+     tied to one thread all the time.  This means UNTIED tasks must be
+     tied and if CPYFN is non-NULL IF(0) must be forced, as CPYFN
+     might be running on different thread than FN.  */
+  if (cpyfn)
+    flags &= ~GOMP_TASK_FLAG_IF;
+  flags &= ~GOMP_TASK_FLAG_UNTIED;
+#endif
+
+  /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
+  if (team && gomp_team_barrier_cancelled (&team->barrier))
+    return;
+  /* Compute N, the number of iterations; return early for an empty loop.  */
+#ifdef TYPE_is_long
+  TYPE s = step;
+  if (step > 0)
+    {
+      if (start >= end)
+	return;
+      s--;
+    }
+  else
+    {
+      if (start <= end)
+	return;
+      s++;
+    }
+  UTYPE n = (end - start + s) / step;
+#else
+  UTYPE n;
+  if (flags & GOMP_TASK_FLAG_UP)
+    {
+      if (start >= end)
+	return;
+      n = (end - start + step - 1) / step;
+    }
+  else
+    {
+      if (start <= end)
+	return;
+      n = (start - end - step - 1) / -step;
+    }
+#endif
+  /* Task I covers TASK_STEP; once I == NFIRST, TASK_STEP drops by STEP.  */
+  TYPE task_step = step;
+  unsigned long nfirst = n;
+  if (flags & GOMP_TASK_FLAG_GRAINSIZE)
+    {
+      unsigned long grainsize = num_tasks;
+#ifdef TYPE_is_long
+      num_tasks = n / grainsize;
+#else
+      UTYPE ndiv = n / grainsize;
+      num_tasks = ndiv;
+      if (num_tasks != ndiv)
+	num_tasks = ~0UL;
+#endif
+      if (num_tasks <= 1)
+	{
+	  num_tasks = 1;
+	  task_step = end - start;
+	}
+      else if (num_tasks >= grainsize
+#ifndef TYPE_is_long
+	       && num_tasks != ~0UL
+#endif
+	      )
+	{
+	  UTYPE mul = num_tasks * grainsize;
+	  task_step = (TYPE) grainsize * step;
+	  if (mul != n)
+	    {
+	      task_step += step;
+	      nfirst = n - mul - 1;
+	    }
+	}
+      else
+	{
+	  UTYPE div = n / num_tasks;
+	  UTYPE mod = n % num_tasks;
+	  task_step = (TYPE) div * step;
+	  if (mod)
+	    {
+	      task_step += step;
+	      nfirst = mod - 1;
+	    }
+	}
+    }
+  else
+    {
+      if (num_tasks == 0)
+	num_tasks = team ? team->nthreads : 1;
+      if (num_tasks >= n)
+	num_tasks = n;
+      else
+	{
+	  UTYPE div = n / num_tasks;
+	  UTYPE mod = n % num_tasks;
+	  task_step = (TYPE) div * step;
+	  if (mod)
+	    {
+	      task_step += step;
+	      nfirst = mod - 1;
+	    }
+	}
+    }
+
+  if (flags & GOMP_TASK_FLAG_NOGROUP)
+    {
+      if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled)
+	return;
+    }
+  else
+    ialias_call (GOMP_taskgroup_start) ();
+
+  /* FIXME, use priority.  */
+  (void) priority;
+  /* Run the tasks undeferred here, or else queue them on the team.  */
+  if ((flags & GOMP_TASK_FLAG_IF) == 0 || team == NULL
+      || (thr->task && thr->task->final_task)
+      || team->task_count + num_tasks > 64 * team->nthreads)
+    {
+      unsigned long i;
+      if (__builtin_expect (cpyfn != NULL, 0))
+	{
+	  struct gomp_task task[num_tasks];
+	  struct gomp_task *parent = thr->task;
+	  arg_size = (arg_size + arg_align - 1) & ~(arg_align - 1);
+	  char buf[num_tasks * arg_size + arg_align - 1];
+	  char *arg = (char *) (((uintptr_t) buf + arg_align - 1)
+				& ~(uintptr_t) (arg_align - 1));
+	  char *orig_arg = arg;
+	  for (i = 0; i < num_tasks; i++)
+	    {
+	      gomp_init_task (&task[i], parent, gomp_icv (false));
+	      task[i].kind = GOMP_TASK_UNDEFERRED;
+	      task[i].final_task = (thr->task && thr->task->final_task)
+				   || (flags & GOMP_TASK_FLAG_FINAL);
+	      if (thr->task)
+		{
+		  task[i].in_tied_task = thr->task->in_tied_task;
+		  task[i].taskgroup = thr->task->taskgroup;
+		}
+	      thr->task = &task[i];
+	      cpyfn (arg, data);
+	      arg += arg_size;
+	    }
+	  arg = orig_arg;
+	  for (i = 0; i < num_tasks; i++)
+	    {
+	      thr->task = &task[i];
+	      ((TYPE *)arg)[0] = start;
+	      start += task_step;
+	      ((TYPE *)arg)[1] = start;
+	      if (i == nfirst)
+		task_step -= step;
+	      fn (arg);
+	      arg += arg_size;
+	      if (task[i].children != NULL)
+		{
+		  gomp_mutex_lock (&team->task_lock);
+		  gomp_clear_parent (task[i].children);
+		  gomp_mutex_unlock (&team->task_lock);
+		}
+	      gomp_end_task ();
+	    }
+	}
+      else
+	for (i = 0; i < num_tasks; i++)
+	  {
+	    struct gomp_task task;
+
+	    gomp_init_task (&task, thr->task, gomp_icv (false));
+	    task.kind = GOMP_TASK_UNDEFERRED;
+	    task.final_task = (thr->task && thr->task->final_task)
+			      || (flags & GOMP_TASK_FLAG_FINAL);
+	    if (thr->task)
+	      {
+		task.in_tied_task = thr->task->in_tied_task;
+		task.taskgroup = thr->task->taskgroup;
+	      }
+	    thr->task = &task;
+	    ((TYPE *)data)[0] = start;
+	    start += task_step;
+	    ((TYPE *)data)[1] = start;
+	    if (i == nfirst)
+	      task_step -= step;
+	    fn (data);
+	    if (task.children != NULL)
+	      {
+		gomp_mutex_lock (&team->task_lock);
+		gomp_clear_parent (task.children);
+		gomp_mutex_unlock (&team->task_lock);
+	      }
+	    gomp_end_task ();
+	  }
+    }
+  else
+    {
+      struct gomp_task *tasks[num_tasks];
+      struct gomp_task *parent = thr->task;
+      struct gomp_taskgroup *taskgroup = parent->taskgroup;
+      char *arg;
+      int do_wake;
+      unsigned long i;
+
+      for (i = 0; i < num_tasks; i++)
+	{
+	  struct gomp_task *task
+	    = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
+	  tasks[i] = task;
+	  arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
+			  & ~(uintptr_t) (arg_align - 1));
+	  gomp_init_task (task, parent, gomp_icv (false));
+	  task->kind = GOMP_TASK_UNDEFERRED;
+	  task->in_tied_task = parent->in_tied_task;
+	  task->taskgroup = taskgroup;
+	  thr->task = task;
+	  if (cpyfn)
+	    {
+	      cpyfn (arg, data);
+	      task->copy_ctors_done = true;
+	    }
+	  else
+	    memcpy (arg, data, arg_size);
+	  ((TYPE *)arg)[0] = start;
+	  start += task_step;
+	  ((TYPE *)arg)[1] = start;
+	  if (i == nfirst)
+	    task_step -= step;
+	  thr->task = parent;
+	  task->kind = GOMP_TASK_WAITING;
+	  task->fn = fn;
+	  task->fn_data = arg;
+	  task->final_task = (flags & GOMP_TASK_FLAG_FINAL) >> 1;
+	}
+      gomp_mutex_lock (&team->task_lock);
+      /* If parallel or taskgroup has been cancelled, don't start new
+	 tasks.  */
+      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
+			     || (taskgroup && taskgroup->cancelled))
+			    && cpyfn == NULL, 0))
+	{
+	  gomp_mutex_unlock (&team->task_lock);
+	  for (i = 0; i < num_tasks; i++)
+	    {
+	      gomp_finish_task (tasks[i]);
+	      free (tasks[i]);
+	    }
+	  if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
+	    ialias_call (GOMP_taskgroup_end) ();
+	  return;
+	}
+      if (taskgroup)
+	taskgroup->num_children += num_tasks;
+      for (i = 0; i < num_tasks; i++)
+	{
+	  struct gomp_task *task = tasks[i];
+	  if (parent->children)
+	    {
+	      task->next_child = parent->children;
+	      task->prev_child = parent->children->prev_child;
+	      task->next_child->prev_child = task;
+	      task->prev_child->next_child = task;
+	    }
+	  else
+	    {
+	      task->next_child = task;
+	      task->prev_child = task;
+	    }
+	  parent->children = task;
+	  if (taskgroup)
+	    {
+	      if (taskgroup->children)
+		{
+		  task->next_taskgroup = taskgroup->children;
+		  task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+		  task->next_taskgroup->prev_taskgroup = task;
+		  task->prev_taskgroup->next_taskgroup = task;
+		}
+	      else
+		{
+		  task->next_taskgroup = task;
+		  task->prev_taskgroup = task;
+		}
+	      taskgroup->children = task;
+	    }
+	  if (team->task_queue)
+	    {
+	      task->next_queue = team->task_queue;
+	      task->prev_queue = team->task_queue->prev_queue;
+	      task->next_queue->prev_queue = task;
+	      task->prev_queue->next_queue = task;
+	    }
+	  else
+	    {
+	      task->next_queue = task;
+	      task->prev_queue = task;
+	      team->task_queue = task;
+	    }
+	  ++team->task_count;
+	  ++team->task_queued_count;
+	}
+      gomp_team_barrier_set_task_pending (&team->barrier);
+      if (team->task_running_count + !parent->in_tied_task
+	  < team->nthreads)
+	{
+	  do_wake = team->nthreads - team->task_running_count
+		    - !parent->in_tied_task;
+	  if ((unsigned long) do_wake > num_tasks)
+	    do_wake = num_tasks;
+	}
+      else
+	do_wake = 0;
+      gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	gomp_team_barrier_wake (&team->barrier, do_wake);
+    }
+  if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
+    ialias_call (GOMP_taskgroup_end) ();
+}
diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp
index 1040c29e0eb..6dc1e8ef3ca 100644
--- a/libgomp/testsuite/lib/libgomp.exp
+++ b/libgomp/testsuite/lib/libgomp.exp
@@ -321,6 +321,19 @@ proc check_effective_target_offload_device { } {
     } ]
 }
 
+# Return 1 if an offload device is available and has a non-shared address space.
+proc check_effective_target_offload_device_nonshared_as { } {
+    return [check_runtime_nocache offload_device_nonshared_as {
+      int main ()
+	{
+	  int a = 8;
+	  #pragma omp target map(to: a)
+	    a++;
+	  return a != 8;
+	}
+    } ]
+}
+
 # Return 1 if at least one nvidia board is present.
 
 proc check_effective_target_openacc_nvidia_accel_present { } {
diff --git a/libgomp/testsuite/libgomp.c++/ctor-13.C b/libgomp/testsuite/libgomp.c++/ctor-13.C
new file mode 100644
index 00000000000..8c7a09f315d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/ctor-13.C
@@ -0,0 +1,242 @@
+// { dg-do run }
+
+#include <omp.h>
+#include <assert.h>
+
+struct B
+{
+  static int ic, dc, xc, ac, cc;
+
+  B();
+  B(const B &);
+  ~B();
+  B& operator=(const B &);
+  void doit();
+  static void clear();
+};
+
+int B::ic;
+int B::dc;
+int B::xc;
+int B::cc;
+int B::ac;
+
+B::B()
+{
+  #pragma omp atomic
+    ic++;
+}
+
+B::~B()
+{
+  #pragma omp atomic
+    dc++;
+}
+
+B::B(const B &)
+{
+  #pragma omp atomic
+    cc++;
+}
+
+B& B::operator=(const B &)
+{
+  #pragma omp atomic
+    ac++;
+  return *this;
+}
+
+void B::doit()
+{
+  #pragma omp atomic
+    xc++;
+}
+
+// Reset every B event counter (ctor, dtor, copy-ctor, assignment, doit)
+// to zero so the next scenario in main() is counted from scratch.
+void B::clear()
+{
+  // Order is irrelevant: the counters are independent static ints.
+  ic = dc = 0;
+  cc = ac = xc = 0;
+}
+
+static int n;
+
+void f1(B &a)
+{
+  B b;
+  B &c = b;
+  #pragma omp parallel default(none) private(a, c) shared (n)
+    {
+      #pragma omp master
+	n = omp_get_num_threads ();
+      a.doit();
+      c.doit();
+    }
+}
+
+void f2(B &a)
+{
+  B b;
+  B &c = b;
+  #pragma omp parallel default(none) firstprivate(a, c) shared(n)
+    {
+      #pragma omp master
+	n = omp_get_num_threads ();
+      a.doit();
+      c.doit();
+    }
+}
+
+void f3(B &a)
+{
+  B b;
+  B &c = b;
+  #pragma omp parallel default(none) shared(n, a, c)
+    {
+      #pragma omp master
+	n = omp_get_num_threads ();
+      #pragma omp for lastprivate (a, c)
+      for (int i = 0; i < omp_get_num_threads (); i++)
+	{
+	  a.doit();
+	  c.doit();
+	}
+    }
+}
+
+void f4()
+{
+  B b;
+  B &c = b;
+  #pragma omp parallel default(none) private (c) shared (n)
+    {
+      B d;
+      B &e = d;
+      #pragma omp single copyprivate (c, e)
+      {
+	c.doit();
+	e.doit();
+      }
+      c.doit();
+      e.doit();
+    }
+}
+
+void f5(B (&a)[2])
+{
+  B b[2];
+  B (&c)[2] = b;
+  #pragma omp parallel default(none) private(a, c) shared (n)
+    {
+      #pragma omp master
+	n = omp_get_num_threads ();
+      a[0].doit();
+      a[1].doit();
+      c[0].doit();
+      c[1].doit();
+    }
+}
+
+void f6(B (&a)[2])
+{
+  B b[2];
+  B (&c)[2] = b;
+  #pragma omp parallel default(none) firstprivate(a, c) shared (n)
+    {
+      #pragma omp master
+	n = omp_get_num_threads ();
+      a[0].doit();
+      a[1].doit();
+      c[0].doit();
+      c[1].doit();
+    }
+}
+
+void f7(B (&a)[2])
+{
+  B b[2];
+  B (&c)[2] = b;
+  #pragma omp parallel default(none) shared(n, a, c)
+    {
+      #pragma omp master
+	n = omp_get_num_threads ();
+      #pragma omp for lastprivate (a, c)
+      for (int i = 0; i < omp_get_num_threads (); i++)
+	{
+	  a[0].doit();
+	  a[1].doit();
+	  c[0].doit();
+	  c[1].doit();
+	}
+    }
+}
+
+void f8()
+{
+  B b[2];
+  B (&c)[2] = b;
+  #pragma omp parallel default(none) private (c) shared (n)
+    {
+      B d[2];
+      B (&e)[2] = d;
+      #pragma omp single copyprivate (c, e)
+      {
+	c[0].doit();
+	c[1].doit();
+	e[0].doit();
+	e[1].doit();
+      }
+      c[0].doit();
+      c[1].doit();
+      e[0].doit();
+      e[1].doit();
+    }
+}
+
+int main()
+{
+  {
+    B a;
+    f1(a);
+  }
+  assert (B::xc == 2*n && B::ic == 2*n+2 && B::dc == 2*n+2 && B::ac == 0 && B::cc == 0);
+  B::clear();
+  {
+    B a;
+    f2(a);
+  }
+  assert (B::xc == 2*n && B::ic == 2 && B::dc == 2*n+2 && B::ac == 0 && B::cc == 2*n);
+  B::clear();
+  {
+    B a;
+    f3(a);
+  }
+  assert (B::xc == 2*n && B::ic == 2*n+2 && B::dc == 2*n+2 && B::ac == 2 && B::cc == 0);
+  B::clear();
+  f4();
+  assert (B::xc == 2*n+2 && B::ic == 2*n+1 && B::dc == 2*n+1 && B::ac == 2*n-2 && B::cc == 0);
+  B::clear();
+  {
+    B a[2];
+    f5(a);
+  }
+  assert (B::xc == 4*n && B::ic == 4*n+4 && B::dc == 4*n+4 && B::ac == 0 && B::cc == 0);
+  B::clear();
+  {
+    B a[2];
+    f6(a);
+  }
+  assert (B::xc == 4*n && B::ic == 4 && B::dc == 4*n+4 && B::ac == 0 && B::cc == 4*n);
+  B::clear();
+  {
+    B a[2];
+    f7(a);
+  }
+  assert (B::xc == 4*n && B::ic == 4*n+4 && B::dc == 4*n+4 && B::ac == 4 && B::cc == 0);
+  B::clear();
+  f8();
+  assert (B::xc == 4*n+4 && B::ic == 4*n+2 && B::dc == 4*n+2 && B::ac == 4*n-4 && B::cc == 0);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/doacross-1.C b/libgomp/testsuite/libgomp.c++/doacross-1.C
new file mode 100644
index 00000000000..bc53ee6e8a2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/doacross-1.C
@@ -0,0 +1,294 @@
+// { dg-do run }
+
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+extern "C" void abort ();
+
+template <typename T>
+class I
+{
+public:
+  typedef ptrdiff_t difference_type;
+  I ();
+  ~I ();
+  I (T *);
+  I (const I &);
+  T &operator * ();
+  T *operator -> ();
+  T &operator [] (const difference_type &) const;
+  I &operator = (const I &);
+  I &operator ++ ();
+  I operator ++ (int);
+  I &operator -- ();
+  I operator -- (int);
+  I &operator += (const difference_type &);
+  I &operator -= (const difference_type &);
+  I operator + (const difference_type &) const;
+  I operator - (const difference_type &) const;
+  template <typename S> friend bool operator == (I<S> &, I<S> &);
+  template <typename S> friend bool operator == (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator < (I<S> &, I<S> &);
+  template <typename S> friend bool operator < (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator <= (I<S> &, I<S> &);
+  template <typename S> friend bool operator <= (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator > (I<S> &, I<S> &);
+  template <typename S> friend bool operator > (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator >= (I<S> &, I<S> &);
+  template <typename S> friend bool operator >= (const I<S> &, const I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (I<S> &, I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (const I<S> &, const I<S> &);
+  template <typename S> friend I<S> operator + (typename I<S>::difference_type , const I<S> &);
+private:
+  T *p;
+};
+template <typename T> I<T>::I () : p (0) {}
+template <typename T> I<T>::~I () {}
+template <typename T> I<T>::I (T *x) : p (x) {}
+template <typename T> I<T>::I (const I &x) : p (x.p) {}
+template <typename T> T &I<T>::operator * () { return *p; }
+template <typename T> T *I<T>::operator -> () { return p; }
+template <typename T> T &I<T>::operator [] (const difference_type &x) const { return p[x]; }
+template <typename T> I<T> &I<T>::operator = (const I &x) { p = x.p; return *this; }
+template <typename T> I<T> &I<T>::operator ++ () { ++p; return *this; }
+template <typename T> I<T> I<T>::operator ++ (int) { return I (p++); }
+template <typename T> I<T> &I<T>::operator -- () { --p; return *this; }
+template <typename T> I<T> I<T>::operator -- (int) { return I (p--); }
+template <typename T> I<T> &I<T>::operator += (const difference_type &x) { p += x; return *this; }
+template <typename T> I<T> &I<T>::operator -= (const difference_type &x) { p -= x; return *this; }
+template <typename T> I<T> I<T>::operator + (const difference_type &x) const { return I (p + x); }
+template <typename T> I<T> I<T>::operator - (const difference_type &x) const { return I (p - x); }
+template <typename T> bool operator == (I<T> &x, I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator == (const I<T> &x, const I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator != (I<T> &x, I<T> &y) { return !(x == y); }
+template <typename T> bool operator != (const I<T> &x, const I<T> &y) { return !(x == y); }
+template <typename T> bool operator < (I<T> &x, I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator < (const I<T> &x, const I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator <= (I<T> &x, I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator <= (const I<T> &x, const I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator > (I<T> &x, I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator > (const I<T> &x, const I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator >= (I<T> &x, I<T> &y) { return x.p >= y.p; }
+template <typename T> bool operator >= (const I<T> &x, const I<T> &y) { return x.p >= y.p; }
+template <typename T> typename I<T>::difference_type operator - (I<T> &x, I<T> &y) { return x.p - y.p; }
+template <typename T> typename I<T>::difference_type operator - (const I<T> &x, const I<T> &y) { return x.p - y.p; }
+template <typename T> I<T> operator + (typename I<T>::difference_type x, const I<T> &y) { return I<T> (x + y.p); }
+
+int results[2048];
+
+// Record one visit of iteration (*i, *j, *k, l) in results[], aborting if any
+// coordinate would not fit its digit of the 512/64/8/1 index encoding
+// (results has 2048 == 4 * 8 * 8 * 8 slots).
+template <typename T>
+void
+baz (I<T> &i, I<T> &j, I<T> &k, T &l)
+{
+  // Bounds follow the encoding: 4 for the 512 stride, 8 for the others.
+  if (*i < 0 || *i >= 4 || *j < 0 || *j >= 8)
+    abort ();
+  if (*k < 0 || *k >= 8 || l < 0 || l >= 8)
+    abort ();
+  #pragma omp atomic
+    results[512 * *i + 64 * *j + 8 * *k + l]++;
+}
+
+// Record one visit of iteration (i, j, k, l) in results[], aborting if any
+// coordinate would not fit its digit of the 512/64/8/1 index encoding
+// (results has 2048 == 4 * 8 * 8 * 8 slots).
+template <typename T>
+void
+baz (T &i, T &j, T &k, T &l)
+{
+  // Bounds follow the encoding: 4 for the 512 stride, 8 for the others.
+  if (i < 0 || i >= 4 || j < 0 || j >= 8)
+    abort ();
+  if (k < 0 || k >= 8 || l < 0 || l >= 8)
+    abort ();
+  #pragma omp atomic
+    results[512 * i + 64 * j + 8 * k + l]++;
+}
+
+void
+f1 (const I<int> &a, const I<int> &b, const I<int> &c, const I<int> &d,
+    const I<int> &e, const I<int> &f, int g, int h,
+    I<int> &r1, I<int> &r2, I<int> &r3)
+{
+  I<int> i, j, k;
+  int l;
+#pragma omp parallel for ordered(4) lastprivate (i, j, k) schedule(static, 1)
+  for (i = a; i <= b; i++)
+    for (j = c; j < d; j++)
+      for (k = e; k < f; k++)
+	for (l = g; l < h; l++)
+	  {
+	    #pragma omp ordered depend(sink: i - 1, j, k + 1, l - 2)
+	    baz (i, j, k, l);
+	    if (i > a && k < f - 1 && l > g + 1)
+	      {
+		int m;
+		#pragma omp atomic read
+		m = results[512 * *(i - 1) + 64 * *j + 8 * *(k + 1) + l - 2];
+		if (m == 0)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	  }
+  r1 = i;
+  r2 = j;
+  r3 = k;
+}
+
+void
+f2 (int a, int b, int c, int d, int e, int f, int g, int h, int &r1, int &r2, int &r3)
+{
+  int i, j, k, l;
+#pragma omp parallel for collapse (1) ordered(4) lastprivate (i, j, k) schedule(static, 2)
+  for (i = a; i <= b; i++)
+    for (j = c; j < d; j++)
+      for (k = e; k < f; k++)
+	for (l = g; l < h; l++)
+	  {
+	    #pragma omp ordered depend(sink: i - 1, j, k + 1, l - 2)
+	    baz (i, j, k, l);
+	    if (i > a && k < f - 1 && l > g + 1)
+	      {
+		int m;
+		#pragma omp atomic read
+		m = results[512 * (i - 1) + 64 * j + 8 * (k + 1) + l - 2];
+		if (m == 0)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	  }
+  r1 = i;
+  r2 = j;
+  r3 = k;
+}
+
+void
+f3 (const I<int> &a, const I<int> &b, const I<int> &c, const I<int> &d,
+    const I<int> &e, const I<int> &f, int g, int h,
+    I<int> &r1, I<int> &r2, I<int> &r3)
+{
+  I<int> i, j, k;
+  int l;
+#pragma omp parallel for collapse (2) ordered(4) lastprivate (i, j, k) schedule(static, 1)
+  for (i = a; i <= b; i++)
+    for (j = c; j < d; j++)
+      for (k = e; k < f; k++)
+	for (l = g; l < h; l++)
+	  {
+	    #pragma omp ordered depend(sink: i - 1, j, k + 1, l - 2)
+	    baz (i, j, k, l);
+	    if (i > a && k < f - 1 && l > g + 1)
+	      {
+		int m;
+		#pragma omp atomic read
+		m = results[512 * *(i - 1) + 64 * *j + 8 * *(k + 1) + l - 2];
+		if (m == 0)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	  }
+  r1 = i;
+  r2 = j;
+  r3 = k;
+}
+
+void
+f4 (int a, int b, int c, int d, int e, int f, int g, int h, int &r1, int &r2, int &r3)
+{
+  int i, j, k, l;
+#pragma omp parallel for collapse (2) ordered(4) lastprivate (i, j, k) schedule(static, 2)
+  for (i = a; i <= b; i++)
+    for (j = c; j < d; j++)
+      for (k = e; k < f; k++)
+	for (l = g; l < h; l++)
+	  {
+	    #pragma omp ordered depend(sink: i - 1, j, k + 1, l - 2)
+	    baz (i, j, k, l);
+	    if (i > a && k < f - 1 && l > g + 1)
+	      {
+		int m;
+		#pragma omp atomic read
+		m = results[512 * (i - 1) + 64 * j + 8 * (k + 1) + l - 2];
+		if (m == 0)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	  }
+  r1 = i;
+  r2 = j;
+  r3 = k;
+}
+
+#define check(expr) \
+  for (int i = 0; i < 2048; i++)			\
+    if (expr)						\
+      {							\
+	if (results[i] != 1)				\
+	  abort ();					\
+	results[i] = 0;					\
+      }							\
+    else if (results[i])				\
+      abort ()
+
+int
+main ()
+{
+  int a[16], s1, s2, s3;
+  I<int> r1, r2, r3;
+  for (int i = 0; i < 16; i++)
+    a[i] = i;
+  r1 = &a[15]; r2 = &a[15]; r3 = &a[15];
+  f1 (&a[1], &a[3], &a[2], &a[5], &a[1], &a[3], 0, 5, r1, r2, r3);
+  if (*r1 != 4 || *r2 != 5 || *r3 != 3)
+    abort ();
+  check ((i / 512) - 1U < 3U && ((i / 64) & 7) - 2U < 3U && ((i / 8) & 7) - 1U < 2U && (i & 7) < 5);
+  r1 = &a[15]; r2 = &a[15]; r3 = &a[15];
+  f1 (&a[1], &a[3], &a[1], &a[4], &a[1], &a[5], 1, 0, r1, r2, r3);
+  if (*r1 != 4 || *r2 != 4 || *r3 != 5)
+    abort ();
+  r1 = &a[15]; r2 = &a[15]; r3 = &a[15];
+  f1 (&a[1], &a[3], &a[1], &a[9], &a[7], &a[2], 0, 7, r1, r2, r3);
+  if (*r1 != 4 || *r2 != 9 || *r3 != 7)
+    abort ();
+  s1 = 15; s2 = 15; s3 = 15;
+  f2 (1, 3, 2, 5, 1, 3, 0, 5, s1, s2, s3);
+  if (s1 != 4 || s2 != 5 || s3 != 3)
+    abort ();
+  check ((i / 512) - 1U < 3U && ((i / 64) & 7) - 2U < 3U && ((i / 8) & 7) - 1U < 2U && (i & 7) < 5);
+  s1 = 15; s2 = 15; s3 = 15;
+  f2 (1, 3, 1, 4, 1, 5, 1, 0, s1, s2, s3);
+  if (s1 != 4 || s2 != 4 || s3 != 5)
+    abort ();
+  s1 = 15; s2 = 15; s3 = 15;
+  f2 (1, 3, 1, 9, 7, 2, 0, 7, s1, s2, s3);
+  if (s1 != 4 || s2 != 9 || s3 != 7)
+    abort ();
+  r1 = &a[15]; r2 = &a[15]; r3 = &a[15];
+  f3 (&a[1], &a[3], &a[2], &a[5], &a[1], &a[3], 0, 5, r1, r2, r3);
+  if (*r1 != 4 || *r2 != 5 || *r3 != 3)
+    abort ();
+  check ((i / 512) - 1U < 3U && ((i / 64) & 7) - 2U < 3U && ((i / 8) & 7) - 1U < 2U && (i & 7) < 5);
+  r1 = &a[15]; r2 = &a[15]; r3 = &a[15];
+  f3 (&a[1], &a[3], &a[1], &a[4], &a[1], &a[5], 1, 0, r1, r2, r3);
+  if (*r1 != 4 || *r2 != 4 || *r3 != 5)
+    abort ();
+  r1 = &a[15]; r2 = &a[15]; r3 = &a[15];
+  f3 (&a[1], &a[3], &a[1], &a[9], &a[7], &a[2], 0, 7, r1, r2, r3);
+  if (*r1 != 4 || *r2 != 9 || *r3 != 7)
+    abort ();
+  s1 = 15; s2 = 15; s3 = 15;
+  f4 (1, 3, 2, 5, 1, 3, 0, 5, s1, s2, s3);
+  if (s1 != 4 || s2 != 5 || s3 != 3)
+    abort ();
+  check ((i / 512) - 1U < 3U && ((i / 64) & 7) - 2U < 3U && ((i / 8) & 7) - 1U < 2U && (i & 7) < 5);
+  s1 = 15; s2 = 15; s3 = 15;
+  f4 (1, 3, 1, 4, 1, 5, 1, 0, s1, s2, s3);
+  if (s1 != 4 || s2 != 4 || s3 != 5)
+    abort ();
+  s1 = 15; s2 = 15; s3 = 15;
+  f4 (1, 3, 1, 9, 7, 2, 0, 7, s1, s2, s3);
+  if (s1 != 4 || s2 != 9 || s3 != 7)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/examples-4/declare_target-2.C b/libgomp/testsuite/libgomp.c++/examples-4/declare_target-2.C
index 75276e7c5c6..6d5b5e47990 100644
--- a/libgomp/testsuite/libgomp.c++/examples-4/declare_target-2.C
+++ b/libgomp/testsuite/libgomp.c++/examples-4/declare_target-2.C
@@ -1,5 +1,5 @@
 // { dg-do run }
-// { dg-require-effective-target offload_device }
+// { dg-require-effective-target offload_device_nonshared_as }
 
 #include <stdlib.h>
 
diff --git a/libgomp/testsuite/libgomp.c++/for-12.C b/libgomp/testsuite/libgomp.c++/for-12.C
new file mode 100644
index 00000000000..ea32192e45d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/for-12.C
@@ -0,0 +1,42 @@
+/* { dg-options "-fopenmp" } */
+
+extern "C" void abort (void);
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#define F taskloop
+#define G taskloop
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F taskloop simd
+#define G taskloop_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+// Run both taskloop test families from one thread of a parallel team and
+// abort if either reports failure.
+int
+main ()
+{
+  int failed = 0;
+  #pragma omp parallel reduction(|:failed)
+    #pragma omp single
+      // Short-circuit: the simd variant only runs if the plain one passed.
+      if (test_taskloop_normal () || test_taskloop_simd_normal ())
+	failed = 1;
+  if (failed)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/for-13.C b/libgomp/testsuite/libgomp.c++/for-13.C
new file mode 100644
index 00000000000..ac1601a766f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/for-13.C
@@ -0,0 +1,151 @@
+extern "C" void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#define F target parallel for
+#define G tpf
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F target simd
+#define G t_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target parallel for simd
+#define G tpf_simd
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute
+#define G ttd
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute
+#define G ttd_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute simd
+#define G ttds
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute simd
+#define G ttds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute parallel for
+#define G ttdpf
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute parallel for dist_schedule(static, 128)
+#define G ttdpf_ds128
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute parallel for simd
+#define G ttdpfs
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute parallel for simd dist_schedule(static, 128)
+#define G ttdpfs_ds128
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+int
+main ()
+{
+  if (test_tpf_static ()
+      || test_tpf_static32 ()
+      || test_tpf_auto ()
+      || test_tpf_guided32 ()
+      || test_tpf_runtime ()
+      || test_t_simd_normal ()
+      || test_tpf_simd_static ()
+      || test_tpf_simd_static32 ()
+      || test_tpf_simd_auto ()
+      || test_tpf_simd_guided32 ()
+      || test_tpf_simd_runtime ()
+      || test_ttd_normal ()
+      || test_ttd_ds128_normal ()
+      || test_ttds_normal ()
+      || test_ttds_ds128_normal ()
+      || test_ttdpf_static ()
+      || test_ttdpf_static32 ()
+      || test_ttdpf_auto ()
+      || test_ttdpf_guided32 ()
+      || test_ttdpf_runtime ()
+      || test_ttdpf_ds128_static ()
+      || test_ttdpf_ds128_static32 ()
+      || test_ttdpf_ds128_auto ()
+      || test_ttdpf_ds128_guided32 ()
+      || test_ttdpf_ds128_runtime ()
+      || test_ttdpfs_static ()
+      || test_ttdpfs_static32 ()
+      || test_ttdpfs_auto ()
+      || test_ttdpfs_guided32 ()
+      || test_ttdpfs_runtime ()
+      || test_ttdpfs_ds128_static ()
+      || test_ttdpfs_ds128_static32 ()
+      || test_ttdpfs_ds128_auto ()
+      || test_ttdpfs_ds128_guided32 ()
+      || test_ttdpfs_ds128_runtime ())
+    abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/for-14.C b/libgomp/testsuite/libgomp.c++/for-14.C
new file mode 100644
index 00000000000..7738473b601
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/for-14.C
@@ -0,0 +1,120 @@
+extern "C" void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+
+#pragma omp declare target
+
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+
+#undef OMPTGT
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPTGT DO_PRAGMA (omp target)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#define F teams distribute
+#define G td
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F teams distribute
+#define G td_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F teams distribute simd
+#define G tds
+#define S
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F teams distribute simd
+#define G tds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "../libgomp.c/for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F teams distribute parallel for
+#define G tdpf
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F teams distribute parallel for dist_schedule(static, 128)
+#define G tdpf_ds128
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F teams distribute parallel for simd
+#define G tdpfs
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+#define F teams distribute parallel for simd dist_schedule(static, 128)
+#define G tdpfs_ds128
+#include "../libgomp.c/for-1.h"
+#undef F
+#undef G
+
+int
+main ()
+{
+  if (test_td_normal ()
+      || test_td_ds128_normal ()
+      || test_tds_normal ()
+      || test_tds_ds128_normal ()
+      || test_tdpf_static ()
+      || test_tdpf_static32 ()
+      || test_tdpf_auto ()
+      || test_tdpf_guided32 ()
+      || test_tdpf_runtime ()
+      || test_tdpf_ds128_static ()
+      || test_tdpf_ds128_static32 ()
+      || test_tdpf_ds128_auto ()
+      || test_tdpf_ds128_guided32 ()
+      || test_tdpf_ds128_runtime ()
+      || test_tdpfs_static ()
+      || test_tdpfs_static32 ()
+      || test_tdpfs_auto ()
+      || test_tdpfs_guided32 ()
+      || test_tdpfs_runtime ()
+      || test_tdpfs_ds128_static ()
+      || test_tdpfs_ds128_static32 ()
+      || test_tdpfs_ds128_auto ()
+      || test_tdpfs_ds128_guided32 ()
+      || test_tdpfs_ds128_runtime ())
+    abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/linear-1.C b/libgomp/testsuite/libgomp.c++/linear-1.C
new file mode 100644
index 00000000000..1dd1ffc8939
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/linear-1.C
@@ -0,0 +1,268 @@
+int a[256];
+
+__attribute__((noinline, noclone)) int
+f1 (int i)
+{
+  #pragma omp parallel for linear (i: 4)
+  for (int j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int &
+f2 (short int &i, char k)
+{
+  #pragma omp parallel for linear (i: k + 1)
+  for (long j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+template <typename T>
+__attribute__((noinline, noclone)) T
+f3 (T i, T k)
+{
+  #pragma omp parallel for linear (i: k)
+  for (short j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+template <typename T>
+__attribute__((noinline, noclone)) T &
+f4 (T &i)
+{
+  #pragma omp parallel for linear (i: 4) schedule(static, 3)
+  for (int j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f5 (short int i, char &k)
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(static, 5)
+  for (long j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+template <int N>
+__attribute__((noinline, noclone)) long long int
+f6 (long long int i, long long int k)
+{
+  #pragma omp parallel for linear (i: k) schedule(static, 7)
+  for (short j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f7 (int &i)
+{
+  #pragma omp parallel for linear (i: 4) schedule(dynamic, 3)
+  for (int j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f8 (short int i, char k)
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(dynamic, 5)
+  for (long j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f9 (long long int i, long long int k)
+{
+  #pragma omp parallel for linear (i: k) schedule(dynamic, 7)
+  for (short j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+template <typename T>
+__attribute__((noinline, noclone)) T &
+f10 (T &i, long &step)
+{
+  #pragma omp parallel for linear (i: 4)
+  for (int j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f11 (short int i, char k, char step)
+{
+  #pragma omp parallel for linear (i: k + 1)
+  for (long j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f12 (long long int i, long long int k, int step)
+{
+  #pragma omp parallel for linear (i: k)
+  for (short j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f13 (int &i, long long int step)
+{
+  #pragma omp parallel for linear (i: 4) schedule(static, 3)
+  for (int j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f14 (short int &i, char &k, int &step)
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(static, 5)
+  for (long j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+template <int N>
+__attribute__((noinline, noclone)) long long int
+f15 (long long int i, long long int k, long int step)
+{
+  #pragma omp parallel for linear (i: k) schedule(static, 7)
+  for (short j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f16 (int i, long long int step)
+{
+  #pragma omp parallel for linear (i: 4) schedule(dynamic, 3)
+  for (int j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f17 (short int i, char k, int step)
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(dynamic, 5)
+  for (long j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+template <typename T>
+__attribute__((noinline, noclone)) T
+f18 (T i, T k, long int step)
+{
+  #pragma omp parallel for linear (i: k) schedule(dynamic, 7)
+  for (short j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+int
+main ()
+{
+#define TEST(x) \
+  if (x != 8 + 48 * 4)				\
+    __builtin_abort ();				\
+  for (int i = 0; i < 256; i++)			\
+    if (a[i] != (((i & 3) == 0 && i >= 8	\
+		  && i < 8 + 48 * 4)		\
+		 ? ((i - 8) / 4) + 16 : 0))	\
+      __builtin_abort ();			\
+  __builtin_memset (a, 0, sizeof (a))
+  TEST (f1 (8));
+  short int vs = 8;
+  TEST (f2 (vs, 3));
+  TEST (f3 (8LL, 4LL));
+  int vi = 8;
+  TEST (f4 (vi));
+  char vk = 3;
+  TEST (f5 (8, vk));
+  TEST (f6<7> (8LL, 4LL));
+  vi = 8;
+  TEST (f7 (vi));
+  TEST (f8 (8, 3));
+  TEST (f9 (8LL, 4LL));
+  vi = 8;
+  long vl = 2;
+  TEST (f10 (vi, vl));
+  TEST (f11 (8, 3, 2));
+  TEST (f12 (8LL, 4LL, 2));
+  vi = 8;
+  TEST (f13 (vi, 2));
+  vs = 8;
+  vk = 3;
+  vi = 2;
+  TEST (f14 (vs, vk, vi));
+  TEST (f15<9> (8LL, 4LL, 2));
+  TEST (f16 (8, 2));
+  TEST (f17 (8, 3, 2));
+  long long int vll1 = 8LL;
+  long long int vll2 = 4LL;
+  TEST (f18<long long int &> (vll1, vll2, 2));
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/member-1.C b/libgomp/testsuite/libgomp.c++/member-1.C
new file mode 100644
index 00000000000..d2d0c5b2667
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/member-1.C
@@ -0,0 +1,206 @@
+// { dg-do run }
+
+#include <omp.h>
+
+struct R { R () {}; ~R () {}; int r; };
+struct T { T () {}; virtual ~T () {}; int t; };
+int c;
+struct A : public R, virtual public T { A () : b(c) {} int a; int &b; void m1 (); };
+
+void  // presumably an optimization barrier for the four objects -- TODO confirm
+take (int &a, int &b, int &c, int &d)
+{  // Empty asm: takes the four addresses as "g" inputs and clobbers "memory".
+  asm volatile ("" : : "g" (&a), "g" (&b), "g" (&c), "g" (&d) : "memory");
+}
+
+void
+A::m1 ()
+{
+  #pragma omp parallel private (a, r, T::t, A::b)
+  {
+    int q = omp_get_thread_num ();
+    a = q;
+    r = 2 * q;
+    t = 3 * q;
+    b = 4 * q;
+    take (a, r, t, b);
+    #pragma omp barrier
+    if (A::a != q || R::r != 2 * q || T::t != 3 * q || A::b != 4 * q)
+      __builtin_abort ();
+  }
+  a = 7;
+  r = 8;
+  t = 9;
+  b = 10;
+  #pragma omp parallel firstprivate (A::a, R::r, t, b)
+  {
+    int q = omp_get_thread_num ();
+    take (A::a, R::r, T::t, A::b);
+    if (a != 7 || r != 8 || t != 9 || b != 10)
+      __builtin_abort ();
+    A::a = 5 * q;
+    R::r = 6 * q;
+    T::t = 7 * q;
+    A::b = 8 * q;
+    take (a, r, t, b);
+    #pragma omp barrier
+    if (a != 5 * q || r != 6 * q || t != 7 * q || b != 8 * q)
+      __builtin_abort ();
+  }
+  bool f = false;
+  a = -5;
+  b = -4;
+  r = -3;
+  t = -2;
+  int n;
+  #pragma omp parallel for firstprivate (a, T::t, b, f) lastprivate (A::a, r, t, n)
+  for (int i = 0; i < omp_get_num_threads (); i++)
+    {
+      int q = omp_get_thread_num ();
+      if (!f)
+	{
+	  if (A::a != -5 || A::b != -4 || T::t != -2)
+	    __builtin_abort ();
+	}
+      else if (a != q || b != 2 * q || r != 3 * q || t != 4 * q)
+	__builtin_abort ();
+      take (a, r, t, b);
+      A::a = q;
+      A::b = 2 * q;
+      R::r = 3 * q;
+      T::t = 4 * q;
+      n = q;
+      f = true;
+    }
+  if (a != n || r != 3 * n || T::t != 4 * n)
+    __builtin_abort ();
+  b = 8;
+  #pragma omp parallel
+    #pragma omp single
+      for (int i = 0; i < 5; i++)
+	#pragma omp task firstprivate (t, b, n) private (a, R::r)
+	  {
+	    if (t != 4 * n || b != 8)
+	      __builtin_abort ();
+	    a = 9;
+	    r = 8;
+	    t = 12;
+	    b = 18;
+	    take (a, r, t, b);
+	    if (a != 9 || r != 8 || t != 12 || b != 18)
+	      __builtin_abort ();
+	  }
+  a = 1;
+  b = 2;
+  R::r = 3;
+  t = 4;
+  #pragma omp parallel private (f)
+    {
+      f = false;
+    #pragma omp single
+    #pragma omp taskloop firstprivate (r, T::t, b, f) lastprivate (a, t, b, n)
+      for (int i = 0; i < 30; i++)
+	{
+	  int q = omp_get_thread_num ();
+	  if (!f)
+	    {
+	      if (R::r != 3 || A::b != 2 || T::t != 4)
+		__builtin_abort ();
+	    }
+	  else if (a != 7 * q || b != 8 * q || r != 9 * q || t != 10 * q)
+	    __builtin_abort ();
+	  take (a, r, t, b);
+	  A::a = 7 * q;
+	  A::b = 8 * q;
+	  R::r = 9 * q;
+	  T::t = 10 * q;
+	  n = q;
+	  f = true;
+	}
+    }
+  if (a != 7 * n || b != 8 * n || t != 10 * n)
+    __builtin_abort ();
+  a = 1;
+  b = 2;
+  R::r = 3;
+  t = 4;
+  #pragma omp parallel private (f)
+    {
+      f = false;
+    #pragma omp single
+    #pragma omp taskloop firstprivate (r, T::t, b, A::a, f)
+      for (int i = 0; i < 30; i++)
+	{
+	  int q = omp_get_thread_num ();
+	  if (!f)
+	    {
+	      if (A::a != 1 || R::r != 3 || A::b != 2 || T::t != 4)
+		__builtin_abort ();
+	    }
+	  else if (a != 7 * q || b != 8 * q || r != 9 * q || t != 10 * q)
+	    __builtin_abort ();
+	  take (a, r, t, b);
+	  A::a = 7 * q;
+	  A::b = 8 * q;
+	  R::r = 9 * q;
+	  T::t = 10 * q;
+	  f = true;
+	}
+    }
+  #pragma omp parallel private (f)
+    {
+      f = false;
+    #pragma omp single
+    #pragma omp taskloop lastprivate (a, t, b, n)
+      for (int i = 0; i < 30; i++)
+	{
+	  int q = omp_get_thread_num ();
+	  if (f && (a != 7 * q || b != 8 * q || r != 9 * q || t != 10 * q))
+	    __builtin_abort ();
+	  take (a, r, t, b);
+	  A::a = 7 * q;
+	  A::b = 8 * q;
+	  R::r = 9 * q;
+	  T::t = 10 * q;
+	  n = q;
+	  f = true;
+	}
+    }
+  if (a != 7 * n || b != 8 * n || t != 10 * n)
+    __builtin_abort ();
+  #pragma omp parallel private (a, T::t, A::b, r)
+    {
+      int q = omp_get_thread_num ();
+      a = q;
+      b = 2 * q;
+      r = 3 * q;
+      t = 4 * q;
+      take (a, b, r, t);
+      #pragma omp single copyprivate (A::a, t, b, R::r)
+	n = q;
+      if (a != n || b != 2 * n || r != 3 * n || t != 4 * n)
+	__builtin_abort ();
+    }
+  a = 0;
+  b = 0;
+  R::r = 0;
+  t = 0;
+  #pragma omp parallel for reduction (+: A::a, t, b, R::r)
+  for (int i = 0; i < 30; i++)
+    {
+      a += i;
+      A::b += 2 * i;
+      r += 3 * i;
+      T::t += 4 * i;
+      take (a, b, r, t);
+    }
+  if (A::a != 435 || b != 2 * 435 || R::r != 3 * 435 || t != 4 * 435)
+    __builtin_abort ();
+}
+
+int
+main ()
+{
+  A a;
+  a.m1 ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/member-2.C b/libgomp/testsuite/libgomp.c++/member-2.C
new file mode 100644
index 00000000000..bb348d8a822
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/member-2.C
@@ -0,0 +1,211 @@
+// { dg-do run }
+
+#include <omp.h>
+
+int c, d, e;  // Globals used as initialisers: d for T::t, c for A::b, e for A::a.
+struct R { R () {}; ~R () {}; int r; };  // Plain (non-virtual) base providing member r.
+template <typename Q>
+struct T { T () : t(d) {}; virtual ~T () {}; Q t; };  // Base template; t is initialised from global d.
+template <typename Q>
+struct A : public R, virtual public T<Q> { A () : b(c), a(e) {} Q a; int &b; void m1 (); };  // Inherits R and, virtually, T<Q>; b is a reference bound to global c.
+
+void
+take (int &a, int &b, int &c, int &d)  // Passes the addresses of all four arguments to an empty asm.
+{
+  asm volatile ("" : : "g" (&a), "g" (&b), "g" (&c), "g" (&d) : "memory");  // No instructions; "memory" clobber. Presumably an optimisation barrier - confirm.
+}
+
+template <typename Q>
+void
+A<Q>::m1 ()  // Names members a, b, r and T<Q>::t in OpenMP data-sharing clauses and aborts on any unexpected value.
+{
+  #pragma omp parallel private (a, r, T<Q>::t, A::b)
+  {  // private: each thread stores multiples of its thread number and re-reads them.
+    int q = omp_get_thread_num ();
+    a = q;
+    r = 2 * q;
+    T<Q>::t = 3 * q;
+    b = 4 * q;
+    take (a, r, T<Q>::t, b);
+    #pragma omp barrier
+    if (A::a != q || R::r != 2 * q || T<Q>::t != 3 * q || A::b != 4 * q)  // Same members, qualified spelling.
+      __builtin_abort ();
+  }
+  a = 7;  // Known values for the firstprivate region below.
+  r = 8;
+  T<Q>::t = 9;
+  b = 10;
+  #pragma omp parallel firstprivate (A::a, R::r, T<Q>::t, b)
+  {  // firstprivate: copies must start as 7/8/9/10 and then keep per-thread writes.
+    int q = omp_get_thread_num ();
+    take (A::a, R::r, T<Q>::t, A::b);
+    if (a != 7 || r != 8 || T<Q>::t != 9 || b != 10)
+      __builtin_abort ();
+    A::a = 5 * q;
+    R::r = 6 * q;
+    T<Q>::t = 7 * q;
+    A::b = 8 * q;
+    take (a, r, T<Q>::t, b);
+    #pragma omp barrier
+    if (a != 5 * q || r != 6 * q || T<Q>::t != 7 * q || b != 8 * q)
+      __builtin_abort ();
+  }
+  bool f = false;  // Set to true at the end of each loop body below; false means "no iteration run on this copy yet".
+  a = -5;
+  b = -4;
+  r = -3;
+  T<Q>::t = -2;
+  int n;  // Receives the thread number written by the last iteration (lastprivate).
+  #pragma omp parallel for firstprivate (a, T<Q>::t, b, f) lastprivate (A::a, r, T<Q>::t, n)
+  for (int i = 0; i < omp_get_num_threads (); i++)
+    {
+      int q = omp_get_thread_num ();
+      if (!f)
+	{
+	  if (A::a != -5 || A::b != -4 || T<Q>::t != -2)  // First iteration: firstprivate values (r is not firstprivate, so not checked).
+	    __builtin_abort ();
+	}
+      else if (a != q || b != 2 * q || r != 3 * q || T<Q>::t != 4 * q)  // Otherwise: values stored by the previous iteration.
+	__builtin_abort ();
+      take (a, r, T<Q>::t, b);
+      A::a = q;
+      A::b = 2 * q;
+      R::r = 3 * q;
+      T<Q>::t = 4 * q;
+      n = q;
+      f = true;
+    }
+  if (a != n || r != 3 * n || T<Q>::t != 4 * n)  // lastprivate members must match the n written in the same iteration.
+    __builtin_abort ();
+  b = 8;
+  #pragma omp parallel
+    #pragma omp single
+      for (int i = 0; i < 5; i++)  // Five tasks created inside the single region.
+	#pragma omp task firstprivate (T<Q>::t, b, n) private (a, R::r)
+	  {
+	    if (T<Q>::t != 4 * n || b != 8)  // firstprivate copies hold the values set before the region.
+	      __builtin_abort ();
+	    a = 9;
+	    r = 8;
+	    T<Q>::t = 12;
+	    b = 18;
+	    take (a, r, T<Q>::t, b);
+	    if (a != 9 || r != 8 || T<Q>::t != 12 || b != 18)
+	      __builtin_abort ();
+	  }
+  a = 1;  // Reset members before the taskloop with both firstprivate and lastprivate.
+  b = 2;
+  R::r = 3;
+  T<Q>::t = 4;
+  #pragma omp parallel private (f)
+    {
+      f = false;
+    #pragma omp single
+    #pragma omp taskloop firstprivate (r, T<Q>::t, b, f) lastprivate (a, T<Q>::t, b, n)
+      for (int i = 0; i < 30; i++)
+	{
+	  int q = omp_get_thread_num ();
+	  if (!f)
+	    {
+	      if (R::r != 3 || A::b != 2 || T<Q>::t != 4)  // a is only lastprivate here, so its initial value is not checked.
+		__builtin_abort ();
+	    }
+	  else if (a != 7 * q || b != 8 * q || r != 9 * q || T<Q>::t != 10 * q)
+	    __builtin_abort ();
+	  take (a, r, T<Q>::t, b);
+	  A::a = 7 * q;
+	  A::b = 8 * q;
+	  R::r = 9 * q;
+	  T<Q>::t = 10 * q;
+	  n = q;
+	  f = true;
+	}
+    }
+  if (a != 7 * n || b != 8 * n || T<Q>::t != 10 * n)  // Values copied out by lastprivate.
+    __builtin_abort ();
+  a = 1;  // Reset members before the firstprivate-only taskloop.
+  b = 2;
+  R::r = 3;
+  T<Q>::t = 4;
+  #pragma omp parallel private (f)
+    {
+      f = false;
+    #pragma omp single
+    #pragma omp taskloop firstprivate (r, T<Q>::t, b, A::a, f)
+      for (int i = 0; i < 30; i++)
+	{
+	  int q = omp_get_thread_num ();
+	  if (!f)
+	    {
+	      if (A::a != 1 || R::r != 3 || A::b != 2 || T<Q>::t != 4)  // All four members are firstprivate in this variant.
+		__builtin_abort ();
+	    }
+	  else if (a != 7 * q || b != 8 * q || r != 9 * q || T<Q>::t != 10 * q)
+	    __builtin_abort ();
+	  take (a, r, T<Q>::t, b);
+	  A::a = 7 * q;
+	  A::b = 8 * q;
+	  R::r = 9 * q;
+	  T<Q>::t = 10 * q;
+	  f = true;
+	}
+    }
+  #pragma omp parallel private (f)
+    {
+      f = false;
+    #pragma omp single
+    #pragma omp taskloop lastprivate (a, T<Q>::t, b, n)
+      for (int i = 0; i < 30; i++)
+	{
+	  int q = omp_get_thread_num ();
+	  if (f && (a != 7 * q || b != 8 * q || r != 9 * q || T<Q>::t != 10 * q))  // Only checked once f has been set by an earlier iteration.
+	    __builtin_abort ();
+	  take (a, r, T<Q>::t, b);
+	  A::a = 7 * q;
+	  A::b = 8 * q;
+	  R::r = 9 * q;
+	  T<Q>::t = 10 * q;
+	  n = q;
+	  f = true;
+	}
+    }
+  if (a != 7 * n || b != 8 * n || T<Q>::t != 10 * n)  // lastprivate-only variant: same post-condition.
+    __builtin_abort ();
+  #pragma omp parallel private (a, T<Q>::t, A::b, r)
+    {
+      int q = omp_get_thread_num ();
+      a = q;
+      b = 2 * q;
+      r = 3 * q;
+      T<Q>::t = 4 * q;
+      take (a, b, r, T<Q>::t);
+      #pragma omp single copyprivate (A::a, T<Q>::t, b, R::r)
+	n = q;  // n records which thread executed the single region.
+      if (a != n || b != 2 * n || r != 3 * n || T<Q>::t != 4 * n)  // After copyprivate every thread must see that thread's values.
+	__builtin_abort ();
+    }
+  a = 0;  // Zero the members before the + reduction.
+  b = 0;
+  R::r = 0;
+  T<Q>::t = 0;
+  #pragma omp parallel for reduction (+: A::a, T<Q>::t, b, R::r)
+  for (int i = 0; i < 30; i++)
+    {
+      a += i;
+      A::b += 2 * i;
+      r += 3 * i;
+      T<Q>::t += 4 * i;
+      take (a, b, r, T<Q>::t);
+    }
+  if (A::a != 435 || b != 2 * 435 || R::r != 3 * 435 || T<Q>::t != 4 * 435)  // 435 = 0 + 1 + ... + 29.
+    __builtin_abort ();
+}
+
+int
+main ()  // Runs m1 for both instantiations: Q = int and Q = int & (reference members).
+{
+  A<int> a;
+  a.m1 ();
+  A<int &> b;
+  b.m1 ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/member-3.C b/libgomp/testsuite/libgomp.c++/member-3.C
new file mode 100644
index 00000000000..50bd587d86b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/member-3.C
@@ -0,0 +1,105 @@
+// { dg-do run }
+
+struct R { R () {}; ~R () {}; int r; };  // Non-virtual base providing r.
+struct T { T () {}; virtual ~T () {}; int t; };  // Base providing t; inherited virtually by A.
+int c;  // Target of the reference member A::b.
+struct A : public R, virtual public T { A () : b(c) {} int a; int &b; void m1 (); };
+int d[64];  // Scratch array: each loop in m1 ORs its own flag bit into a range of elements.
+
+void
+A::m1 ()  // Uses data members as OpenMP loop iteration variables and checks the results in r, t, a and d[].
+{
+  r = 0;
+  #pragma omp parallel for private (a) reduction(|:R::r)
+  for (a = 0; A::a < 31; a += 2)  // Member a is the iterator: 0, 2, ..., 30.
+    r |= (1 << A::a);
+  if (r != 0x55555555)  // All even bits 0..30 set.
+    __builtin_abort ();
+  #pragma omp parallel for simd linear (R::r)
+  for (R::r = 0; r < 32; R::r++)
+    d[r + 8] |= 1;  // Flag 1 on d[8..39].
+  for (int i = 0; i < 64; i++)
+    if (d[i] != ((i >= 8 && i < 32 + 8) ? 1 : 0))
+      __builtin_abort ();
+  #pragma omp parallel for lastprivate (t)
+  for (T::t = 0; t < 32; t += 3)
+    d[T::t + 2] |= 2;  // Flag 2 on every third element starting at d[2].
+  if (T::t != 33)  // First value of t that fails t < 32 with step 3.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 8 && i < 32 + 8) ? 1 : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)))
+      __builtin_abort ();
+  #pragma omp simd linear (t)
+  for (t = 0; t < 32; t++)
+    d[T::t + 9] |= 4;  // Flag 4 on d[9..40].
+  if (t != 32)
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 8 && i < 32 + 8) ? 1 : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)))
+      __builtin_abort ();
+  r = 0;
+  #pragma omp parallel for reduction(|:r)
+  for (a = 0; A::a < 31; a += 2)  // Same loop as the first one, but without an explicit private (a).
+    r |= (1 << A::a);
+  if (r != 0x55555555)
+    __builtin_abort ();
+  #pragma omp parallel for simd
+  for (R::r = 0; r < 32; R::r += 2)
+    d[r + 8] |= 8;  // Flag 8 on the even indices of d[8..38].
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)))
+      __builtin_abort ();
+  #pragma omp simd collapse(2)
+  for (T::t = 0; t < 7; t += 2)
+    for (a = 0; A::a < 8; a++)
+      d[((t << 2) | a) + 3] |= 16;  // t in {0,2,4,6}, a in 0..7: flag 16 on d[3..34].
+  if (t != 8 || A::a != 8)  // Both iterators must hold their final values after the loop.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? 16 : 0)))
+      __builtin_abort ();
+  T::t = 32;  // Values differing from the expected finals (8, 8), so the check below needs the loop to update them.
+  a = 16;
+  #pragma omp parallel
+  #pragma omp single
+  #pragma omp taskloop simd collapse(2)
+  for (t = 0; T::t < 7; T::t += 2)
+    for (A::a = 0; a < 8; A::a++)
+      d[((t << 2) | A::a) + 3] |= 32;  // Same index range as above, flag 32.
+  if (T::t != 8 || a != 8)
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? (16 | 32) : 0)))
+      __builtin_abort ();
+  #pragma omp parallel
+  #pragma omp single
+  #pragma omp taskloop simd
+  for (R::r = 0; r < 31; R::r += 2)
+    d[r + 8] |= 64;  // Flag 64 on the even indices of d[8..38].
+  if (r != 32)  // First even value that fails r < 31.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (64 | 8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? (16 | 32) : 0)))
+      __builtin_abort ();
+}
+
+int
+main ()  // Test driver: constructs one A and runs m1, which aborts on failure.
+{
+  A a;
+  a.m1 ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/member-4.C b/libgomp/testsuite/libgomp.c++/member-4.C
new file mode 100644
index 00000000000..f76695de6fb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/member-4.C
@@ -0,0 +1,108 @@
+// { dg-do run }
+
+int c, d, e;  // Initialisers: d for T::t, c for A::b, e for A::a.
+struct R { R () {}; ~R () {}; int r; };  // Non-virtual base providing r.
+template <typename Q>
+struct T { T () : t(d) {}; virtual ~T () {}; Q t; };  // Base template; t initialised from global d.
+template <typename Q>
+struct A : public R, virtual public T<Q> { A () : b(c), a(e) {} Q a; int &b; void m1 (); };  // b is a reference bound to global c.
+int f[64];  // Scratch array: each loop in m1 ORs its own flag bit into a range of elements.
+
+template <typename Q>
+void
+A<Q>::m1 ()  // Template variant: data members (including dependent T<Q>::t) used as OpenMP loop iterators.
+{
+  r = 0;
+  #pragma omp parallel for private (a) reduction(|:R::r)
+  for (a = 0; A::a < 31; a += 2)  // Member a iterates 0, 2, ..., 30.
+    r |= (1 << A::a);
+  if (r != 0x55555555)  // All even bits 0..30 set.
+    __builtin_abort ();
+  #pragma omp parallel for simd linear (R::r)
+  for (R::r = 0; r < 32; R::r++)
+    f[r + 8] |= 1;  // Flag 1 on f[8..39].
+  for (int i = 0; i < 64; i++)
+    if (f[i] != ((i >= 8 && i < 32 + 8) ? 1 : 0))
+      __builtin_abort ();
+  #pragma omp parallel for lastprivate (T<Q>::t)
+  for (T<Q>::t = 0; T<Q>::t < 32; T<Q>::t += 3)
+    f[T<Q>::t + 2] |= 2;  // Flag 2 on every third element starting at f[2].
+  if (T<Q>::t != 33)  // First value that fails t < 32 with step 3.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (f[i] != (((i >= 8 && i < 32 + 8) ? 1 : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)))
+      __builtin_abort ();
+  #pragma omp simd linear (T<Q>::t)
+  for (T<Q>::t = 0; T<Q>::t < 32; T<Q>::t++)
+    f[T<Q>::t + 9] |= 4;  // Flag 4 on f[9..40].
+  if (T<Q>::t != 32)
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (f[i] != (((i >= 8 && i < 32 + 8) ? 1 : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)))
+      __builtin_abort ();
+  r = 0;
+  #pragma omp parallel for reduction(|:r)
+  for (a = 0; A::a < 31; a += 2)  // Same loop as the first one, without an explicit private (a).
+    r |= (1 << A::a);
+  if (r != 0x55555555)
+    __builtin_abort ();
+  #pragma omp parallel for simd
+  for (R::r = 0; r < 32; R::r += 2)
+    f[r + 8] |= 8;  // Flag 8 on the even indices of f[8..38].
+  for (int i = 0; i < 64; i++)
+    if (f[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)))
+      __builtin_abort ();
+  #pragma omp simd collapse(2)
+  for (T<Q>::t = 0; T<Q>::t < 7; T<Q>::t += 2)
+    for (a = 0; A::a < 8; a++)
+      f[((T<Q>::t << 2) | a) + 3] |= 16;  // t in {0,2,4,6}, a in 0..7: flag 16 on f[3..34].
+  if (T<Q>::t != 8 || A::a != 8)  // Both iterators must hold their final values.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (f[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? 16 : 0)))
+      __builtin_abort ();
+  T<Q>::t = 32;  // Values differing from the expected finals (8, 8), so the check below needs the loop to update them.
+  a = 16;
+  #pragma omp parallel
+  #pragma omp single
+  #pragma omp taskloop simd collapse(2)
+  for (T<Q>::t = 0; T<Q>::t < 7; T<Q>::t += 2)
+    for (A::a = 0; a < 8; A::a++)
+      f[((T<Q>::t << 2) | A::a) + 3] |= 32;  // Same index range as above, flag 32.
+  if (T<Q>::t != 8 || a != 8)
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (f[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? (16 | 32) : 0)))
+      __builtin_abort ();
+  #pragma omp parallel
+  #pragma omp single
+  #pragma omp taskloop simd
+  for (R::r = 0; r < 31; R::r += 2)
+    f[r + 8] |= 64;  // Flag 64 on the even indices of f[8..38].
+  if (r != 32)  // First even value that fails r < 31.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (f[i] != (((i >= 8 && i < 32 + 8) ? ((i & 1) ? 1 : (64 | 8 | 1)) : 0)
+		 | ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 2 : 0)
+		 | ((i >= 9 && i < 32 + 9) ? 4 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? (16 | 32) : 0)))
+      __builtin_abort ();
+}
+
+int
+main ()  // Test driver: instantiates A<int> and runs m1, which aborts on failure.
+{
+  A<int> a;
+  a.m1 ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/member-5.C b/libgomp/testsuite/libgomp.c++/member-5.C
new file mode 100644
index 00000000000..d6fec7a841a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/member-5.C
@@ -0,0 +1,183 @@
+// { dg-do run }
+
+typedef __PTRDIFF_TYPE__ ptrdiff_t;  // Uses the compiler's predefined macro instead of including a header.
+
+template <typename T>
+class I  // Iterator-like class wrapping a single T* (member p); used as the loop iterator type in this test.
+{
+public:
+  typedef ptrdiff_t difference_type;
+  I ();
+  ~I ();
+  I (T *);  // Non-explicit, so a raw T* converts implicitly to I<T>.
+  I (const I &);
+  T &operator * ();
+  T *operator -> ();
+  T &operator [] (const difference_type &) const;
+  I &operator = (const I &);
+  I &operator ++ ();
+  I operator ++ (int);
+  I &operator -- ();
+  I operator -- (int);
+  I &operator += (const difference_type &);
+  I &operator -= (const difference_type &);
+  I operator + (const difference_type &) const;
+  I operator - (const difference_type &) const;
+  template <typename S> friend bool operator == (I<S> &, I<S> &);  // Each friend operator is declared for both non-const and const references.
+  template <typename S> friend bool operator == (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator < (I<S> &, I<S> &);
+  template <typename S> friend bool operator < (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator <= (I<S> &, I<S> &);
+  template <typename S> friend bool operator <= (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator > (I<S> &, I<S> &);
+  template <typename S> friend bool operator > (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator >= (I<S> &, I<S> &);
+  template <typename S> friend bool operator >= (const I<S> &, const I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (I<S> &, I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (const I<S> &, const I<S> &);
+  template <typename S> friend I<S> operator + (typename I<S>::difference_type , const I<S> &);
+private:
+  T *p;  // The wrapped pointer.
+};
+template <typename T> I<T>::I () : p (0) {}  // Default state: null pointer.
+template <typename T> I<T>::~I () {}
+template <typename T> I<T>::I (T *x) : p (x) {}
+template <typename T> I<T>::I (const I &x) : p (x.p) {}
+template <typename T> T &I<T>::operator * () { return *p; }
+template <typename T> T *I<T>::operator -> () { return p; }
+template <typename T> T &I<T>::operator [] (const difference_type &x) const { return p[x]; }
+template <typename T> I<T> &I<T>::operator = (const I &x) { p = x.p; return *this; }
+template <typename T> I<T> &I<T>::operator ++ () { ++p; return *this; }
+template <typename T> I<T> I<T>::operator ++ (int) { return I (p++); }  // Post-increment: returns a copy holding the old pointer.
+template <typename T> I<T> &I<T>::operator -- () { --p; return *this; }
+template <typename T> I<T> I<T>::operator -- (int) { return I (p--); }  // Post-decrement: returns a copy holding the old pointer.
+template <typename T> I<T> &I<T>::operator += (const difference_type &x) { p += x; return *this; }
+template <typename T> I<T> &I<T>::operator -= (const difference_type &x) { p -= x; return *this; }
+template <typename T> I<T> I<T>::operator + (const difference_type &x) const { return I (p + x); }
+template <typename T> I<T> I<T>::operator - (const difference_type &x) const { return I (p - x); }
+template <typename T> bool operator == (I<T> &x, I<T> &y) { return x.p == y.p; }  // All comparisons below compare the wrapped pointers.
+template <typename T> bool operator == (const I<T> &x, const I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator != (I<T> &x, I<T> &y) { return !(x == y); }  // Not a friend: implemented via operator ==.
+template <typename T> bool operator != (const I<T> &x, const I<T> &y) { return !(x == y); }
+template <typename T> bool operator < (I<T> &x, I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator < (const I<T> &x, const I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator <= (I<T> &x, I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator <= (const I<T> &x, const I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator > (I<T> &x, I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator > (const I<T> &x, const I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator >= (I<T> &x, I<T> &y) { return x.p >= y.p; }
+template <typename T> bool operator >= (const I<T> &x, const I<T> &y) { return x.p >= y.p; }
+template <typename T> typename I<T>::difference_type operator - (I<T> &x, I<T> &y) { return x.p - y.p; }  // Distance between two iterators.
+template <typename T> typename I<T>::difference_type operator - (const I<T> &x, const I<T> &y) { return x.p - y.p; }
+template <typename T> I<T> operator + (typename I<T>::difference_type x, const I<T> &y) { return I<T> (x + y.p); }  // Offset + iterator form.
+
+struct R { R () {}; ~R () {}; I<int> r; };  // Non-virtual base; r is an I<int> iterator member.
+struct T { T () {}; virtual ~T () {}; I<int> t; };  // Base providing iterator member t; inherited virtually by A.
+struct A : public R, virtual public T { A () {} I<int> a; void m1 (const I<int> &, const I<int> &); };
+template <typename Q>
+struct U { U () {}; virtual ~U () {}; Q t; };  // Template counterpart of T; Q is the iterator type.
+template <typename Q>
+struct B : public R, virtual public U<Q> { B () {} Q a; void m2 (const Q &, const Q &, const I<int> &, const I<int> &); };  // Template counterpart of A.
+
+int d[64];  // Scratch array: loops in m1/m2 OR flag bits into it and then verify the pattern.
+
+void
+A::m1 (const I<int> &x, const I<int> &y)  // Uses I<int> data members as OpenMP loop iterators over [x, y - k); aborts on mismatch.
+{
+  int w = 0;
+  #pragma omp parallel for private (a) reduction(|:w)
+  for (a = x; A::a < y - 33; a += 2)
+    w |= (1 << *A::a);  // Sets the bit numbered by each visited element.
+  if (w != 0x55555555)  // Expected when the visited values are 0, 2, ..., 30.
+    __builtin_abort ();
+  #pragma omp parallel for lastprivate (t)
+  for (T::t = x; t < y - 32; t += 3)
+    d[*T::t + 2] |= 1;
+  if (*T::t != 33)  // lastprivate iterator must point at the element holding 33.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 1 : 0))
+      __builtin_abort ();
+  w = 0;
+  #pragma omp parallel for reduction(|:w)
+  for (a = x; A::a < y - 33; a += 2)  // Same loop as the first one, without an explicit private (a).
+    w |= (1 << *A::a);
+  if (w != 0x55555555)
+    __builtin_abort ();
+  #pragma omp taskloop
+  for (R::r = x; r < y - 32; R::r += 2)
+    d[*r + 8] |= 2;  // Flag 2 at offset 8 from each visited value.
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 1 : 0)
+		 | ((i >= 8 && i < 32 + 8 && (i & 1) == 0) ? 2 : 0)))
+      __builtin_abort ();
+  #pragma omp taskloop collapse(2)
+  for (T::t = x; t < y - 57; t += 2)
+    for (a = x; A::a < y - 56; a++)
+      d[((*t << 2) | *a) + 3] |= 4;  // Index combines both iterators' current values.
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 1 : 0)
+		 | ((i >= 8 && i < 32 + 8 && (i & 1) == 0) ? 2 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? 4 : 0)))
+      __builtin_abort ();
+}
+
+template <typename Q>
+void
+B<Q>::m2 (const Q &u, const Q &v, const I<int> &x, const I<int> &y)  // Template counterpart of A::m1: Q-typed members iterate [u, v - k), r iterates [x, y - k).
+{
+  int w = 0;
+  #pragma omp parallel for private (a) reduction(|:w)
+  for (a = u; B::a < v - 33; a += 2)
+    w |= (1 << *B::a);  // Sets the bit numbered by each visited element.
+  if (w != 0x55555555)  // Expected when the visited values are 0, 2, ..., 30.
+    __builtin_abort ();
+  #pragma omp parallel for lastprivate (U<Q>::t)
+  for (U<Q>::t = u; U<Q>::t < v - 32; U<Q>::t += 3)
+    d[*U<Q>::t + 2] |= 1;
+  if (*U<Q>::t != 33)  // lastprivate iterator must point at the element holding 33.
+    __builtin_abort ();
+  for (int i = 0; i < 64; i++)
+    if (d[i] != ((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 1 : 0))
+      __builtin_abort ();
+  w = 0;
+  #pragma omp parallel for reduction(|:w)
+  for (a = u; B::a < v - 33; a += 2)  // Same loop as the first one, without an explicit private (a).
+    w |= (1 << *B::a);
+  if (w != 0x55555555)
+    __builtin_abort ();
+  #pragma omp taskloop
+  for (R::r = x; r < y - 32; R::r += 2)  // r comes from non-template base R, so it is always I<int>.
+    d[*r + 8] |= 2;
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 1 : 0)
+		 | ((i >= 8 && i < 32 + 8 && (i & 1) == 0) ? 2 : 0)))
+      __builtin_abort ();
+  #pragma omp taskloop collapse(2)
+  for (U<Q>::t = u; U<Q>::t < v - 57; U<Q>::t += 2)
+    for (a = u; B::a < v - 56; a++)
+      d[((*U<Q>::t << 2) | *a) + 3] |= 4;  // Index combines both iterators' current values.
+  for (int i = 0; i < 64; i++)
+    if (d[i] != (((i >= 2 && i < 32 + 2 && (i - 2) % 3 == 0) ? 1 : 0)
+		 | ((i >= 8 && i < 32 + 8 && (i & 1) == 0) ? 2 : 0)
+		 | ((i >= 3 && i < 32 + 3) ? 4 : 0)))
+      __builtin_abort ();
+}
+
+int
+main ()  // Runs the checks with A (I<int> members), B<I<int> > and B<int *>, clearing d[] between runs.
+{
+  A a;
+  int b[128];
+  for (int i = 0; i < 128; i++)
+    b[i] = i - 32;  // b[32] holds 0, so iterating from &b[32] visits 0, 1, 2, ...
+  a.m1 (&b[32], &b[96]);  // int * arguments convert to I<int> through I (T *).
+  for (int i = 0; i < 64; i++)
+    d[i] = 0;  // Global d: the local named d is not declared yet.
+  B<I<int> > c;
+  c.m2 (&b[32], &b[96], &b[32], &b[96]);
+  for (int i = 0; i < 64; i++)
+    d[i] = 0;
+  B<int *> d;  // From here on, d in main names this object and hides the global array.
+  d.m2 (&b[32], &b[96], &b[32], &b[96]);
+}
diff --git a/libgomp/testsuite/libgomp.c++/ordered-1.C b/libgomp/testsuite/libgomp.c++/ordered-1.C
new file mode 100644
index 00000000000..a1bedd808ac
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/ordered-1.C
@@ -0,0 +1 @@
+#include "../libgomp.c/ordered-4.c"
diff --git a/libgomp/testsuite/libgomp.c++/reduction-10.C b/libgomp/testsuite/libgomp.c++/reduction-10.C
new file mode 100644
index 00000000000..2254430f168
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reduction-10.C
@@ -0,0 +1,201 @@
+template <typename T>
+struct A  // Wrapper around one T value; default-constructs to 0.
+{
+  A () { t = 0; }
+  A (T x) { t = x; }
+  A (const A &x) { t = x.t; }
+  ~A () {}
+  T t;
+};
+template <typename T>
+struct M  // Wrapper around one T value; default-constructs to 1.
+{
+  M () { t = 1; }
+  M (T x) { t = x; }
+  M (const M &x) { t = x.t; }
+  ~M () {}
+  T t;
+};
+template <typename T>
+struct B  // Wrapper around one T value; default-constructs to ~(T) 0.
+{
+  B () { t = ~(T) 0; }
+  B (T x) { t = x; }
+  B (const B &x) { t = x.t; }
+  ~B () {}
+  T t;
+};
+template <typename T>
+void
+add (T &x, T &y)  // Adds y.t into x.t; used as the combiner of the A<char> + reduction declared in this file.
+{
+  x.t += y.t;
+}
+template <typename T>
+void
+zero (T &x)  // Sets x.t to 0; used as the initializer of the A<char> + reduction declared in this file.
+{
+  x.t = 0;
+}
+template <typename T>
+void
+orit (T *x, T *y)  // ORs x->t into y->t: the second argument is the destination.
+{
+  y->t |= x->t;
+}
+B<long> bb;  // Global B<long>; foo declares a local array also named bb, which hides this one there.
+#pragma omp declare reduction(+:A<int>:omp_out.t += omp_in.t)
+#pragma omp declare reduction(+:A<char>:add (omp_out, omp_in)) initializer(zero (omp_priv))
+#pragma omp declare reduction(*:M<int>:omp_out.t *= omp_in.t) initializer(omp_priv = 1)
+#pragma omp declare reduction(|:A<unsigned long long>:orit (&omp_in, &omp_out))
+#pragma omp declare reduction(&:B<long>:omp_out.t = omp_out.t & omp_in.t) initializer(orit (&omp_priv, &omp_orig))
+#pragma omp declare reduction(maxb:short:omp_out = omp_in > omp_out ? omp_in : omp_out) initializer(omp_priv = -6)
+
+A<char> z[10];  // Global array reduced through the z[:p3] array section in foo.
+
+template <int N>
+__attribute__((noinline, noclone)) void
+foo (A<int> (*&x)[3][N], M<int> *y, B<long> (&w)[1][N], int p1, long p2, long p3, int p4,
+     int p5, long p6, short p7)  // Array-section reductions whose bounds mix runtime parameters p1..p7 with template parameter N.
+{
+  A<unsigned long long> a[p7 + 4];  // Variable-length arrays sized from p7.
+  short bb[p7];
+  short (&b)[p7] = bb;  // The reduction names the reference b; results are read back through bb.
+  for (int i = 0; i < p7; i++)
+    bb[i] = -6;  // Same value the maxb initializer assigns to omp_priv.
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2 + N - 2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5 - N + 2]) \
+			   reduction(&:w[0:p6 - 3 + N][:p6]) reduction(maxb:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;
+      if ((i & 15) == 1)
+	y[0].t *= 3;
+      if ((i & 31) == N)
+	y[1].t *= 7;
+      if ((i & 63) == 3)
+	y[N].t *= 17;
+      z[i / 32].t += (i & 3);
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  // i & 30 yields even bit numbers 0..30.
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  // Clears bits 0, 3, ..., 21.
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[N])
+	b[N] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (int i = 0; i < 9; i++)
+    if (a[i].t != (i < 4 ? 0x55555555ULL : 0))  // Only a[0..3] are written by the loop (i / 32 < 4).
+      __builtin_abort ();
+  if (bb[0] != 78 || bb[1] != 12 || bb[N] != 22 || bb[3] != 84 || bb[4] != 127)  // Maxima of i % 79, 13, 23, 85, 192 for i < 128.
+    __builtin_abort ();
+}
+
+A<int> a3[4][3][2];  // Globals that struct S binds its members to (see S's constructor).
+A<int> (*p3)[3][2] = &a3[1];  // Points at the second 3x2 plane of a3.
+M<int> y3[5] = { 0, 1, 1, 1, 0 };
+B<long> w3[1][2];  // Default-constructed, so every t starts as ~0L.
+
+template <int N>
+struct S  // Holds the same reduction operands as free function foo, but as data members.
+{
+  A<int> (*&x)[3][N];  // Reference to a pointer to a 3 x N array.
+  M<int> *y;
+  B<long> (&w)[1][N];
+  A<char> z[10];
+  short b[5];
+  A<unsigned long long> a[9];
+  S() : x(p3), y(y3+1), w(w3), z(), a(), b() {}  // x, y and w bind to the globals p3, y3 + 1 and w3.
+  __attribute__((noinline, noclone)) void foo (int, long, long, int, int, long, short);
+};
+
+template <int N>
+void
+S<N>::foo (int p1, long p2, long p3, int p4, int p5, long p6, short p7)  // Member version: the reduction list items are data members of S.
+{
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2][0:N], z[:p3 + N - 2]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 3 + N][:p6]) reduction(maxb:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;
+      if ((i & 15) == 1)
+	y[0].t *= 3;
+      if ((i & 31) == N)
+	y[1].t *= 7;
+      if ((i & 63) == 3)
+	y[N].t *= 17;
+      z[i / 32].t += (i & 3);
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  // i & 30 yields even bit numbers 0..30.
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  // Clears bits 0, 3, ..., 21.
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[N])
+	b[N] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+}
+
+int
+main ()  // Runs the free-function and member-function reductions and compares against precomputed tables.
+{
+  A<int> a[4][3][2];
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};  // Expected sums; planes 0 and 3 must stay untouched.
+  A<int> (*p)[3][2] = &a[1];  // foo indexes x[0] and x[1], i.e. a[1] and a[2].
+  M<int> y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  // 3^8, 7^4, 17^2, with the outer elements left at 0.
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };
+  B<long> w[1][2];
+  foo<2> (p, y + 1, w, 1, 3L, 4L, 3, 4, 2L, 5);
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+	if (a[i][j][k].t != a2[i][j][k])
+	  __builtin_abort ();
+  for (int i = 0; i < 5; i++)
+    if (y[i].t != y2[i])
+      __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (z[i].t != z2[i])  // Global z, reduced by the free function foo.
+      __builtin_abort ();
+  if (w[0][0].t != ~0x249249L || w[0][1].t != ~0x249249L)  // 0x249249 has bits 0, 3, ..., 21 set.
+    __builtin_abort ();
+  S<2> s;
+  s.foo (1, 3L, 4L, 3, 4, 2L, 5);  // Same parameters as the foo<2> call, now against the global a3/y3/w3.
+  for (int i = 0; i < 9; i++)
+    if (s.a[i].t != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+	if (a3[i][j][k].t != a2[i][j][k])
+	  __builtin_abort ();
+  for (int i = 0; i < 5; i++)
+    if (y3[i].t != y2[i])
+      __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (s.z[i].t != z2[i])
+      __builtin_abort ();
+  if (w3[0][0].t != ~0x249249L || w3[0][1].t != ~0x249249L)
+    __builtin_abort ();
+  if (s.b[0] != 78 || s.b[1] != 12 || s.b[2] != 22
+      || s.b[3] != 84 || s.b[4] != 127)
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/reduction-5.C b/libgomp/testsuite/libgomp.c++/reduction-5.C
new file mode 100644
index 00000000000..212fd69be58
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reduction-5.C
@@ -0,0 +1,127 @@
+char z[10] = { 0 };  // Global array reduced through the z[:4] array section in foo.
+
+__attribute__((noinline, noclone)) void
+foo (int (*&x)[3][2], int *y, long (&w)[1][2])  // Array-section reductions with built-in operators on pointer, reference and local array operands.
+{
+  unsigned long long a[9] = {};
+  short b[5] = {};
+  #pragma omp parallel for reduction(+:x[0:2][:][0:2], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:][:2]) reduction(max:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;
+      if ((i & 15) == 1)
+	y[0] *= 3;
+      if ((i & 31) == 2)
+	y[1] *= 7;
+      if ((i & 63) == 3)
+	y[2] *= 17;
+      z[i / 32] += (i & 3);
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  // i & 30 yields even bit numbers 0..30.
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  // Clears bits 0, 3, ..., 21.
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (int i = 0; i < 9; i++)
+    if (a[i] != (i < 4 ? 0x55555555ULL : 0))  // Only a[0..3] are written by the loop (i / 32 < 4).
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[2] != 22 || b[3] != 84 || b[4] != 127)  // Maxima of i % 79, 13, 23, 85, 192 for i < 128.
+    __builtin_abort ();
+}
+
+int a3[4][3][2];  // Globals that struct S binds its members to (see S's constructor).
+int (*p3)[3][2] = &a3[1];  // Points at the second 3x2 plane of a3.
+int y3[5] = { 0, 1, 1, 1, 0 };
+long w3[1][2] = { ~0L, ~0L };  // All bits set, so the & reduction can only clear bits.
+short bb[5];  // Bound to the reference member S::b.
+
+struct S  // Holds the same reduction operands as free function foo, but as data members.
+{
+  int (*&x)[3][2];  // Reference to a pointer to a 3 x 2 array.
+  int *y;
+  long (&w)[1][2];
+  char z[10];
+  short (&b)[5];  // Reference member: results land in the global bb.
+  unsigned long long a[9];
+  S() : x(p3), y(y3+1), w(w3), z(), a(), b(bb) {}  // x, y, w and b bind to the globals p3, y3 + 1, w3 and bb.
+  __attribute__((noinline, noclone)) void foo ();
+};
+
+void
+S::foo ()  // Member version: the reduction list items are data members of S.
+{
+  #pragma omp parallel for reduction(+:x[0:2][:][0:2], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:][:2]) reduction(max:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;
+      if ((i & 15) == 1)
+	y[0] *= 3;
+      if ((i & 31) == 2)
+	y[1] *= 7;
+      if ((i & 63) == 3)
+	y[2] *= 17;
+      z[i / 32] += (i & 3);
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  // i & 30 yields even bit numbers 0..30.
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  // Clears bits 0, 3, ..., 21.
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+}
+
+int
+main ()  // Runs the free-function and member-function reductions and compares against precomputed tables.
+{
+  int a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};  // Expected sums; planes 0 and 3 must stay untouched.
+  int (*p)[3][2] = &a[1];  // foo indexes x[0] and x[1], i.e. a[1] and a[2].
+  int y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  // 3^8, 7^4, 17^2, with the outer elements left at 0.
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };
+  long w[1][2] = { ~0L, ~0L };
+  foo (p, y + 1, w);
+  if (__builtin_memcmp (a, a2, sizeof (a))
+      || __builtin_memcmp (y, y2, sizeof (y))
+      || __builtin_memcmp (z, z2, sizeof (z))
+      || w[0][0] != ~0x249249L
+      || w[0][1] != ~0x249249L)  // 0x249249 has bits 0, 3, ..., 21 set.
+    __builtin_abort ();
+  S s;
+  s.foo ();  // Operates on the global a3/y3/w3/bb through S's reference and pointer members.
+  for (int i = 0; i < 9; i++)
+    if (s.a[i] != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (__builtin_memcmp (a3, a2, sizeof (a3))
+      || __builtin_memcmp (y3, y2, sizeof (y3))
+      || __builtin_memcmp (s.z, z2, sizeof (s.z))
+      || w3[0][0] != ~0x249249L
+      || w3[0][1] != ~0x249249L)
+    __builtin_abort ();
+  if (bb[0] != 78 || bb[1] != 12 || bb[2] != 22 || bb[3] != 84 || bb[4] != 127)  // Maxima written through the reference member s.b.
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/reduction-6.C b/libgomp/testsuite/libgomp.c++/reduction-6.C
new file mode 100644
index 00000000000..f180ca35edd
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reduction-6.C
@@ -0,0 +1,195 @@
+template <typename T>
+struct A  // wrapper around a single T; default-constructs to 0
+{
+  A () { t = 0; }
+  A (T x) { t = x; }
+  A (const A &x) { t = x.t; }
+  ~A () {}  // user-provided (empty) destructor
+  T t;
+};
+template <typename T>
+struct M  // wrapper around a single T; default-constructs to 1
+{
+  M () { t = 1; }
+  M (T x) { t = x; }
+  M (const M &x) { t = x.t; }
+  ~M () {}  // user-provided (empty) destructor
+  T t;
+};
+template <typename T>
+struct B  // wrapper around a single T; default-constructs to all bits set
+{
+  B () { t = ~(T) 0; }
+  B (T x) { t = x; }
+  B (const B &x) { t = x.t; }
+  ~B () {}  // user-provided (empty) destructor
+  T t;
+};
+template <typename T>
+void
+add (T &x, T &y)  // used as the combiner of the '+' reduction declared for A<char>
+{
+  x.t += y.t;  // accumulate y into x
+}
+template <typename T>
+void
+zero (T &x)  // used as the initializer of the '+' reduction declared for A<char>
+{
+  x.t = 0;
+}
+template <typename T>
+void
+orit (T *x, T *y)  // ORs *x into *y: the destination is the SECOND argument
+{
+  y->t |= x->t;
+}
+B<long> bb;
+#pragma omp declare reduction(+:A<int>:omp_out.t += omp_in.t)
+#pragma omp declare reduction(+:A<char>:add (omp_out, omp_in)) initializer(zero (omp_priv))
+#pragma omp declare reduction(*:M<int>:omp_out.t *= omp_in.t) initializer(omp_priv = 1)
+#pragma omp declare reduction(|:A<unsigned long long>:orit (&omp_in, &omp_out))
+#pragma omp declare reduction(&:B<long>:omp_out.t = omp_out.t & omp_in.t) initializer(orit (&omp_priv, &omp_orig))
+#pragma omp declare reduction(maxb:short:omp_out = omp_in > omp_out ? omp_in : omp_out) initializer(omp_priv = -6)
+
+A<char> z[10];
+
+__attribute__((noinline, noclone)) void
+foo (A<int> (*&x)[3][2], M<int> *y, B<long> (&w)[1][2])  // array-section reductions over class-typed elements
+{
+  A<unsigned long long> a[9];
+  short bb[5] = {};  // this local hides the global B<long> bb inside the function
+  short (&b)[5] = bb;  // the reduction clause names the reference b, the checks below read bb
+  #pragma omp parallel for reduction(+:x[0:2][:][0:2], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:][:2]) reduction(maxb:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0].t *= 3;
+      if ((i & 31) == 2)  // true for 4 iterations
+	y[1].t *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[2].t *= 17;
+      z[i / 32].t += (i & 3);  // each 32-iteration bucket adds 8 * (0 + 1 + 2 + 3) = 48
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  // i & 30 drops bit 0: even bit positions 0..30 only
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (int i = 0; i < 9; i++)
+    if (a[i].t != (i < 4 ? 0x55555555ULL : 0))  // a[4..8] must keep their default value 0
+      __builtin_abort ();
+  if (bb[0] != 78 || bb[1] != 12 || bb[2] != 22 || bb[3] != 84 || bb[4] != 127)  // maxima of i % 79, 13, 23, 85, 192 over i = 0..127
+    __builtin_abort ();
+}
+
+A<int> a3[4][3][2];
+A<int> (*p3)[3][2] = &a3[1];
+M<int> y3[5] = { 0, 1, 1, 1, 0 };
+B<long> w3[1][2];
+
+struct S  // member-variable counterpart of the free foo's arguments and locals
+{
+  A<int> (*&x)[3][2];  // reference to a pointer; bound to the global p3
+  M<int> *y;  // set to y3 + 1
+  B<long> (&w)[1][2];  // bound to the global w3
+  A<char> z[10];
+  short b[5];
+  A<unsigned long long> a[9];
+  S() : x(p3), y(y3+1), w(w3), z(), a(), b() {}  // a() is listed before b() although b is declared first
+  __attribute__((noinline, noclone)) void foo ();
+};
+
+void
+S::foo ()  // same loop as the free foo, with the reduction items being (or reached through) data members
+{
+  #pragma omp parallel for reduction(+:x[0:2][:][0:2], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:][:2]) reduction(maxb:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0].t *= 3;
+      if ((i & 31) == 2)  // true for 4 iterations
+	y[1].t *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[2].t *= 17;
+      z[i / 32].t += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+}
+
+int
+main ()  // runs the free foo and S::foo and checks each reduced object element by element
+{
+  A<int> a[4][3][2];  // elements default-construct to t == 0
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},  // expected .t values of a and of a3
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};
+  A<int> (*p)[3][2] = &a[1];  // start at a[1]; rows 0 and 3 are expected to stay zero
+  M<int> y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  // 6561 = 3^8, 2401 = 7^4, 289 = 17^2
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };  // 48 per bucket, plus i for i < 4
+  B<long> w[1][2];  // elements default-construct to all bits set
+  foo (p, y + 1, w);  // y + 1: foo's y[0..2] are y[1..3] here
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+	if (a[i][j][k].t != a2[i][j][k])
+	  __builtin_abort ();
+  for (int i = 0; i < 5; i++)
+    if (y[i].t != y2[i])
+      __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (z[i].t != z2[i])
+      __builtin_abort ();
+  if (w[0][0].t != ~0x249249L || w[0][1].t != ~0x249249L)  // 0x249249 has bits 0, 3, ..., 21 set
+    __builtin_abort ();
+  S s;
+  s.foo ();
+  for (int i = 0; i < 9; i++)
+    if (s.a[i].t != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+	if (a3[i][j][k].t != a2[i][j][k])  // S::x is bound to p3, which points at a3[1]
+	  __builtin_abort ();
+  for (int i = 0; i < 5; i++)
+    if (y3[i].t != y2[i])
+      __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (s.z[i].t != z2[i])
+      __builtin_abort ();
+  if (w3[0][0].t != ~0x249249L || w3[0][1].t != ~0x249249L)
+    __builtin_abort ();
+  if (s.b[0] != 78 || s.b[1] != 12 || s.b[2] != 22
+      || s.b[3] != 84 || s.b[4] != 127)
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/reduction-7.C b/libgomp/testsuite/libgomp.c++/reduction-7.C
new file mode 100644
index 00000000000..75f9d08aac4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reduction-7.C
@@ -0,0 +1,134 @@
+char z[10] = { 0 };
+
+__attribute__((noinline, noclone)) void
+foo (int (*&x)[3][2], int *y, long (&w)[1][2], int p1, long p2, long p3, int p4,
+     int p5, long p6, short p7)  // p1..p7 supply the array-section bounds at run time
+{
+  unsigned long long a[p7 + 4];  // variable-length array; main passes p7 = 5
+  short b[p7];  // variable-length array
+  for (int i = 0; i < p7 + 4; i++)
+    {
+      if (i < p7)
+	b[i] = -6;  // below every value the loop can store (remainders are >= 0)
+      a[i] = 0;
+    }
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 1][:p6]) reduction(max:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0] *= 3;
+      if ((i & 31) == 2)  // true for 4 iterations
+	y[1] *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[2] *= 17;
+      z[i / 32] += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (int i = 0; i < 9; i++)  // the literal 9 equals p7 + 4 only for p7 == 5
+    if (a[i] != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[2] != 22 || b[3] != 84 || b[4] != 127)  // indices 0..4 require p7 >= 5
+    __builtin_abort ();
+}
+
+int a3[4][3][2];
+int (*p3)[3][2] = &a3[1];
+int y3[5] = { 0, 1, 1, 1, 0 };
+long w3[1][2] = { ~0L, ~0L };
+short bb[5];
+
+struct S  // member-variable counterpart of the free foo's arguments and locals
+{
+  int (*&x)[3][2];  // reference to a pointer; bound to the global p3
+  int *y;  // set to y3 + 1
+  long (&w)[1][2];  // bound to the global w3
+  char z[10];
+  short (&b)[5];  // reference member bound to the global bb
+  unsigned long long a[9];
+  S() : x(p3), y(y3+1), w(w3), z(), a(), b(bb) {}  // a() is listed before b(bb) although b is declared first
+  __attribute__((noinline, noclone)) void foo (int, long, long, int, int, long, short);
+};
+
+void
+S::foo (int p1, long p2, long p3, int p4, int p5, long p6, short p7)  // p1..p7 supply the section bounds
+{  // unlike the free foo (max:b), the max reduction here is written as a section, b[0:p7]
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 1][:p6]) reduction(max:b[0:p7])
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0] *= 3;
+      if ((i & 31) == 2)  // true for 4 iterations
+	y[1] *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[2] *= 17;
+      z[i / 32] += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+}
+
+int
+main ()  // runs the free foo and S::foo with run-time section bounds, then checks the results
+{
+  int a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},  // expected contents of a and of a3
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};
+  int (*p)[3][2] = &a[1];  // start at a[1]; rows 0 and 3 are expected to stay zero
+  int y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  // 6561 = 3^8, 2401 = 7^4, 289 = 17^2
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };  // 48 per bucket, plus i for i < 4
+  long w[1][2] = { ~0L, ~0L };
+  foo (p, y + 1, w, 1, 3L, 4L, 3, 4, 2L, 5);  // p1..p7 = 1, 3, 4, 3, 4, 2, 5
+  if (__builtin_memcmp (a, a2, sizeof (a))
+      || __builtin_memcmp (y, y2, sizeof (y))
+      || __builtin_memcmp (z, z2, sizeof (z))
+      || w[0][0] != ~0x249249L  // 0x249249 has bits 0, 3, ..., 21 set
+      || w[0][1] != ~0x249249L)
+    __builtin_abort ();
+  S s;
+  s.foo (1, 3L, 4L, 3, 4, 2L, 5);  // same bounds as the free-function call above
+  for (int i = 0; i < 9; i++)
+    if (s.a[i] != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (__builtin_memcmp (a3, a2, sizeof (a3))
+      || __builtin_memcmp (y3, y2, sizeof (y3))
+      || __builtin_memcmp (s.z, z2, sizeof (s.z))
+      || w3[0][0] != ~0x249249L
+      || w3[0][1] != ~0x249249L)
+    __builtin_abort ();
+  if (bb[0] != 78 || bb[1] != 12 || bb[2] != 22 || bb[3] != 84 || bb[4] != 127)  // s.b refers to bb
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/reduction-8.C b/libgomp/testsuite/libgomp.c++/reduction-8.C
new file mode 100644
index 00000000000..cffd7cc2d4c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reduction-8.C
@@ -0,0 +1,198 @@
+template <typename T>
+struct A  // wrapper around a single T; default-constructs to 0
+{
+  A () { t = 0; }
+  A (T x) { t = x; }
+  A (const A &x) { t = x.t; }
+  ~A () {}  // user-provided (empty) destructor
+  T t;
+};
+template <typename T>
+struct M  // wrapper around a single T; default-constructs to 1
+{
+  M () { t = 1; }
+  M (T x) { t = x; }
+  M (const M &x) { t = x.t; }
+  ~M () {}  // user-provided (empty) destructor
+  T t;
+};
+template <typename T>
+struct B  // wrapper around a single T; default-constructs to all bits set
+{
+  B () { t = ~(T) 0; }
+  B (T x) { t = x; }
+  B (const B &x) { t = x.t; }
+  ~B () {}  // user-provided (empty) destructor
+  T t;
+};
+template <typename T>
+void
+add (T &x, T &y)  // used as the combiner of the '+' reduction declared for A<char>
+{
+  x.t += y.t;  // accumulate y into x
+}
+template <typename T>
+void
+zero (T &x)  // used as the initializer of the '+' reduction declared for A<char>
+{
+  x.t = 0;
+}
+template <typename T>
+void
+orit (T *x, T *y)  // ORs *x into *y: the destination is the SECOND argument
+{
+  y->t |= x->t;
+}
+B<long> bb;
+#pragma omp declare reduction(+:A<int>:omp_out.t += omp_in.t)
+#pragma omp declare reduction(+:A<char>:add (omp_out, omp_in)) initializer(zero (omp_priv))
+#pragma omp declare reduction(*:M<int>:omp_out.t *= omp_in.t) initializer(omp_priv = 1)
+#pragma omp declare reduction(|:A<unsigned long long>:orit (&omp_in, &omp_out))
+#pragma omp declare reduction(&:B<long>:omp_out.t = omp_out.t & omp_in.t) initializer(orit (&omp_priv, &omp_orig))
+#pragma omp declare reduction(maxb:short:omp_out = omp_in > omp_out ? omp_in : omp_out) initializer(omp_priv = -6)
+
+A<char> z[10];
+
+__attribute__((noinline, noclone)) void
+foo (A<int> (*&x)[3][2], M<int> *y, B<long> (&w)[1][2], int p1, long p2, long p3, int p4,
+     int p5, long p6, short p7)  // class-typed elements combined with run-time section bounds
+{
+  A<unsigned long long> a[p7 + 4];  // variable-length array of class type; main passes p7 = 5
+  short bb[p7];  // this local hides the global B<long> bb inside the function
+  short (&b)[p7] = bb;  // reference to the variable-length array; the reduction clause names b
+  for (int i = 0; i < p7; i++)
+    bb[i] = -6;  // same value as the maxb initializer (omp_priv = -6)
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 1][:p6]) reduction(maxb:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0].t *= 3;
+      if ((i & 31) == 2)  // true for 4 iterations
+	y[1].t *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[2].t *= 17;
+      z[i / 32].t += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (int i = 0; i < 9; i++)  // the literal 9 equals p7 + 4 only for p7 == 5
+    if (a[i].t != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (bb[0] != 78 || bb[1] != 12 || bb[2] != 22 || bb[3] != 84 || bb[4] != 127)  // indices 0..4 require p7 >= 5
+    __builtin_abort ();
+}
+
+A<int> a3[4][3][2];
+A<int> (*p3)[3][2] = &a3[1];
+M<int> y3[5] = { 0, 1, 1, 1, 0 };
+B<long> w3[1][2];
+
+struct S  // member-variable counterpart of the free foo's arguments and locals
+{
+  A<int> (*&x)[3][2];  // reference to a pointer; bound to the global p3
+  M<int> *y;  // set to y3 + 1
+  B<long> (&w)[1][2];  // bound to the global w3
+  A<char> z[10];
+  short b[5];
+  A<unsigned long long> a[9];
+  S() : x(p3), y(y3+1), w(w3), z(), a(), b() {}  // a() is listed before b() although b is declared first
+  __attribute__((noinline, noclone)) void foo (int, long, long, int, int, long, short);
+};
+
+void
+S::foo (int p1, long p2, long p3, int p4, int p5, long p6, short p7)  // p7 is not used in this body
+{  // unlike the free foo, the x section here also spells out the last dimension ([0:2])
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2][0:2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 1][:p6]) reduction(maxb:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0].t *= 3;
+      if ((i & 31) == 2)  // true for 4 iterations
+	y[1].t *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[2].t *= 17;
+      z[i / 32].t += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+}
+
+int
+main ()  // runs the free foo and S::foo with run-time section bounds and checks each element
+{
+  A<int> a[4][3][2];  // elements default-construct to t == 0
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},  // expected .t values of a and of a3
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};
+  A<int> (*p)[3][2] = &a[1];  // start at a[1]; rows 0 and 3 are expected to stay zero
+  M<int> y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  // 6561 = 3^8, 2401 = 7^4, 289 = 17^2
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };  // 48 per bucket, plus i for i < 4
+  B<long> w[1][2];  // elements default-construct to all bits set
+  foo (p, y + 1, w, 1, 3L, 4L, 3, 4, 2L, 5);  // p1..p7 = 1, 3, 4, 3, 4, 2, 5
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+	if (a[i][j][k].t != a2[i][j][k])
+	  __builtin_abort ();
+  for (int i = 0; i < 5; i++)
+    if (y[i].t != y2[i])
+      __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (z[i].t != z2[i])
+      __builtin_abort ();
+  if (w[0][0].t != ~0x249249L || w[0][1].t != ~0x249249L)  // 0x249249 has bits 0, 3, ..., 21 set
+    __builtin_abort ();
+  S s;
+  s.foo (1, 3L, 4L, 3, 4, 2L, 5);  // same bounds as the free-function call above
+  for (int i = 0; i < 9; i++)
+    if (s.a[i].t != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 3; j++)
+      for (int k = 0; k < 2; k++)
+	if (a3[i][j][k].t != a2[i][j][k])  // S::x is bound to p3, which points at a3[1]
+	  __builtin_abort ();
+  for (int i = 0; i < 5; i++)
+    if (y3[i].t != y2[i])
+      __builtin_abort ();
+  for (int i = 0; i < 10; i++)
+    if (s.z[i].t != z2[i])
+      __builtin_abort ();
+  if (w3[0][0].t != ~0x249249L || w3[0][1].t != ~0x249249L)
+    __builtin_abort ();
+  if (s.b[0] != 78 || s.b[1] != 12 || s.b[2] != 22
+      || s.b[3] != 84 || s.b[4] != 127)
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/reduction-9.C b/libgomp/testsuite/libgomp.c++/reduction-9.C
new file mode 100644
index 00000000000..117a8f66c52
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reduction-9.C
@@ -0,0 +1,130 @@
+char z[10] = { 0 };
+
+template <int N>
+__attribute__((noinline, noclone)) void
+foo (int (*&x)[3][N], int *y, long (&w)[1][N])  // N stands in for the literal 2; main instantiates foo<2>
+{
+  unsigned long long a[9] = {};
+  short b[5] = {};
+  #pragma omp parallel for reduction(+:x[0:N][:][0:N], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:][:N]) reduction(max:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0] *= 3;
+      if ((i & 31) == N)  // with N == 2: true for 4 iterations
+	y[1] *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[N] *= 17;
+      z[i / 32] += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[N])
+	b[N] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (int i = 0; i < 9; i++)
+    if (a[i] != (i < 4 ? 0x55555555ULL : 0))  // a[4..8] must keep their initial 0
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[N] != 22 || b[3] != 84 || b[4] != 127)  // maxima of i % 79, 13, 23, 85, 192 over i = 0..127
+    __builtin_abort ();
+}
+
+int a3[4][3][2];
+int (*p3)[3][2] = &a3[1];
+int y3[5] = { 0, 1, 1, 1, 0 };
+long w3[1][2] = { ~0L, ~0L };
+short bb[5];
+
+template <int N>
+struct S  // member-variable counterpart of foo<N>; main uses S<2>
+{
+  int (*&x)[3][N];  // reference to a pointer; bound to the global p3
+  int *y;  // set to y3 + 1
+  long (&w)[1][N];  // bound to the global w3
+  char z[10];
+  short (&b)[5];  // reference member bound to the global bb
+  unsigned long long a[9];
+  S() : x(p3), y(y3+1), w(w3), z(), a(), b(bb) {}  // a() is listed before b(bb) although b is declared first
+  __attribute__((noinline, noclone)) void foo ();
+};
+
+template <int N>
+void
+S<N>::foo ()  // same loop as foo<N>, with the reduction items being (or reached through) data members
+{
+  #pragma omp parallel for reduction(+:x[0:N][:][0:N], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:][:N]) reduction(max:b)
+  for (int i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;  // i / 64 is 0 or 1 for i < 128
+      if ((i & 15) == 1)  // true for 8 of the 128 iterations
+	y[0] *= 3;
+      if ((i & 31) == N)  // with N == 2: true for 4 iterations
+	y[1] *= 7;
+      if ((i & 63) == 3)  // true for 2 iterations
+	y[N] *= 17;
+      z[i / 32] += (i & 3);  // each 32-iteration bucket adds 48 in total
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  // even bit positions 0..30 only
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  // clears bits 0, 3, ..., 21
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[N])
+	b[N] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+}
+
+int
+main ()  // instantiates foo<2> and S<2>, runs both, and compares against precomputed values
+{
+  int a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},  // expected contents of a and of a3
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};
+  int (*p)[3][2] = &a[1];  // start at a[1]; rows 0 and 3 are expected to stay zero
+  int y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  // 6561 = 3^8, 2401 = 7^4, 289 = 17^2
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };  // 48 per bucket, plus i for i < 4
+  long w[1][2] = { ~0L, ~0L };
+  foo<2> (p, y + 1, w);  // y + 1: foo's y[0..2] are y[1..3] here
+  if (__builtin_memcmp (a, a2, sizeof (a))
+      || __builtin_memcmp (y, y2, sizeof (y))
+      || __builtin_memcmp (z, z2, sizeof (z))
+      || w[0][0] != ~0x249249L  // 0x249249 has bits 0, 3, ..., 21 set
+      || w[0][1] != ~0x249249L)
+    __builtin_abort ();
+  S<2> s;
+  s.foo ();
+  for (int i = 0; i < 9; i++)
+    if (s.a[i] != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (__builtin_memcmp (a3, a2, sizeof (a3))
+      || __builtin_memcmp (y3, y2, sizeof (y3))
+      || __builtin_memcmp (s.z, z2, sizeof (s.z))
+      || w3[0][0] != ~0x249249L
+      || w3[0][1] != ~0x249249L)
+    __builtin_abort ();
+  if (bb[0] != 78 || bb[1] != 12 || bb[2] != 22 || bb[3] != 84 || bb[4] != 127)  // s.b refers to bb
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/reference-1.C b/libgomp/testsuite/libgomp.c++/reference-1.C
new file mode 100644
index 00000000000..f2a78614a13
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/reference-1.C
@@ -0,0 +1,57 @@
+// { dg-do run }
+
+#include <omp.h>
+
+__attribute__((noinline, noclone)) void
+foo (int &a, short &d, char &g)  // every variable named in the clauses below is a reference
+{
+  unsigned long b = 12;
+  unsigned long &c = b;  // local reference, listed as private
+  long long e = 21;
+  long long &f = e;  // local reference, listed as firstprivate
+  unsigned int h = 12;
+  unsigned int &k = h;  // local reference, listed as shared and later as lastprivate
+  #pragma omp parallel default(none) private(a, c) firstprivate(d, f) shared(g, k)
+    {
+      int i = omp_get_thread_num ();
+      a = i;
+      c = 2 * i;
+      if (d != 27 || f != 21)  // 27 is main's d, 21 is the initial value of e
+	__builtin_abort ();
+      d = 3 * (i & 0xfff);
+      f = 4 * i;
+      #pragma omp barrier
+      if (a != i || c != 2 * i || d != 3 * (i & 0xfff) || f != 4 * i)  // each thread must still see its own values
+	__builtin_abort ();
+      #pragma omp for lastprivate(g, k)
+      for (int j = 0; j < 32; j++)
+	{
+	  g = j;
+	  k = 3 * j;
+	}
+    }
+  if (g != 31 || k != 31 * 3)  // values written by the last iteration, j == 31
+    __builtin_abort ();
+  #pragma omp parallel for firstprivate (g, k) lastprivate (g, k)
+  for (int j = 0; j < 32; j++)
+    {
+      if (g != 31 || k != 31 * 3)  // only j == 31 writes g and k, so other iterations see the incoming values
+	__builtin_abort ();
+      if (j == 31)
+	{
+	  g = 29;
+	  k = 138;
+	}
+    }
+  if (g != 29 || k != 138)  // values written by the last iteration
+    __builtin_abort ();
+}
+
+int
+main ()
+{
+  int a = 5;
+  short d = 27;  // foo checks for this value inside its parallel region
+  char g = ' ';
+  foo (a, d, g);  // foo aborts on any mismatch; nothing is checked after it returns
+}
diff --git a/libgomp/testsuite/libgomp.c++/simd14.C b/libgomp/testsuite/libgomp.c++/simd14.C
new file mode 100644
index 00000000000..dc18cb619ac
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/simd14.C
@@ -0,0 +1,43 @@
+// { dg-do run }
+// { dg-options "-O2" }
+// { dg-additional-options "-msse2" { target sse2_runtime } }
+// { dg-additional-options "-mavx" { target avx_runtime } }
+
+int a[1024];
+short b[2048];
+
+static inline void
+bar (int &x, unsigned long long &y, short *&z)  // one loop-body step; all three arguments are advanced
+{
+  a[x] = x + y + *z;
+  x++;
+  y += 17;  // same step as linear(s:17ULL) in foo
+  z += 2;  // same step as linear(t:2) in foo
+}
+
+__attribute__((noinline, noclone)) int
+foo (unsigned long long &s, short *&t)  // linear clauses applied to reference-typed variables
+{
+  int i, j = 0;
+  int &r = j;  // r aliases j, so the returned j reflects updates made through r
+#pragma omp parallel for simd linear(r) linear(s:17ULL) linear(t:2)
+  for (i = 0; i < 1024; i++)
+    bar (r, s, t);
+  return j;
+}
+
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 2048; i++)
+    b[i] = 3 * i;
+  unsigned long long s = 12;
+  short *t = b;
+  int j = foo (s, t);
+  for (i = 0; i < 1024; i++)
+    if (a[i] != 12 + 24 * i)  // i + (12 + 17 * i) + b[2 * i], with b[2 * i] == 6 * i
+      __builtin_abort ();
+  if (j != 1024 || s != 12 + 1024 * 17ULL || t != &b[2048])  // each variable advanced by 1024 steps
+    __builtin_abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-10.C b/libgomp/testsuite/libgomp.c++/target-10.C
new file mode 100644
index 00000000000..860773eed15
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-10.C
@@ -0,0 +1,154 @@
+extern "C" void abort (void);
+union U { int x; long long y; };  // used as a member of both T and S
+struct T { int a; union U b; int c; };  // nested inside S as member v
+struct S { int s; int u; T v; int x[10]; union U w; int y[10]; int z[10]; };  // individual members and array sections of this are mapped below
+volatile int z;
+
+template <typename R>
+void
+foo ()  // template twin of main's body; main calls foo <S>
+{
+  R s;
+  s.template s = 0;  // 'template' precedes a non-template member name here
+  s.u = 1;
+  s.v.a = 2;
+  s.v.b.y = 3LL;
+  s.v.c = 19;
+  s.w.x = 4;
+  s.template x[0] = 7;
+  s.x[1] = 8;
+  s.y[3] = 9;
+  s.y[4] = 10;
+  s.y[5] = 11;
+  int err = 0;
+  #pragma omp target map (to:s.template v.template b, s.u, s.x[0:z + 2]) \
+		     map (tofrom:s.y[3:3]) \
+		     map (from: s.w, s.template z[z + 1:z + 3], err)
+  {  // the global z is never assigned in this file, so with z == 0 the sections are x[0:2] and z[1:3]
+    err = 0;
+    if (s.u != 1 || s.v.b.y != 3LL || s.x[0] != 7 || s.x[1] != 8
+	|| s.y[3] != 9 || s.y[4] != 10 || s.y[5] != 11)
+      err = 1;
+    s.w.x = 6;
+    s.y[3] = 12;
+    s.y[4] = 13;
+    s.y[5] = 14;
+    s.z[1] = 15;
+    s.z[2] = 16;
+    s.z[3] = 17;
+  }
+  if (err || s.w.x != 6 || s.y[3] != 12 || s.y[4] != 13 || s.y[5] != 14  // values written inside the region
+      || s.z[1] != 15 || s.z[2] != 16 || s.z[3] != 17)
+    abort ();
+  s.u++;
+  s.v.a++;
+  s.v.b.y++;
+  s.w.x++;  // now u == 2, v.b.y == 4, w.x == 7, as checked inside the next region
+  s.x[1] = 18;
+  s.z[0] = 19;
+  #pragma omp target data map (tofrom: s)
+  #pragma omp target map (always to: s.template w, s.x[1], err) map (alloc:s.u, s. template v.template b, s.z[z:z + 1])
+  {
+    err = 0;
+    if (s.u != 2 || s.v.b.y != 4LL || s.w.x != 7 || s.x[1] != 18 || s.z[0] != 19)
+      err = 1;
+    s.w.x = 8;
+    s.x[1] = 20;
+    s.z[0] = 21;
+  }
+  if (err || s.w.x != 8 || s.x[1] != 20 || s.z[0] != 21)
+    abort ();
+  s.u++;
+  s.v.a++;
+  s.v.b.y++;
+  s.w.x++;  // now u == 3, v.b.y == 5, w.x == 9, as checked inside the next region
+  s.x[0] = 22;
+  s.x[1] = 23;
+  #pragma omp target data map (from: s.w, s.x[0:2]) map (to: s.v.b, s.u)
+  #pragma omp target map (always to: s.w, s.x[0:2], err) map (alloc:s.u, s.v.b)
+  {
+    err = 0;
+    if (s.u != 3 || s.v.b.y != 5LL || s.w.x != 9 || s.x[0] != 22 || s.x[1] != 23)
+      err = 1;
+    s.w.x = 11;
+    s.x[0] = 24;
+    s.x[1] = 25;
+  }
+  if (err || s.w.x != 11 || s.x[0] != 24 || s.x[1] != 25)
+    abort ();
+}
+
+int
+main ()  // non-template version of the checks, followed by the template version via foo <S>
+{
+  S s;
+  s.s = 0;
+  s.u = 1;
+  s.v.a = 2;
+  s.v.b.y = 3LL;
+  s.v.c = 19;
+  s.w.x = 4;
+  s.x[0] = 7;
+  s.x[1] = 8;
+  s.y[3] = 9;
+  s.y[4] = 10;
+  s.y[5] = 11;
+  int err = 0;
+  #pragma omp target map (to:s.v.b, s.u, s.x[0:z + 2]) \
+		     map (tofrom:s.y[3:3]) \
+		     map (from: s.w, s.z[z + 1:z + 3], err)
+  {  // the global z is never assigned in this file, so with z == 0 the sections are x[0:2] and z[1:3]
+    err = 0;
+    if (s.u != 1 || s.v.b.y != 3LL || s.x[0] != 7 || s.x[1] != 8
+	|| s.y[3] != 9 || s.y[4] != 10 || s.y[5] != 11)
+      err = 1;
+    s.w.x = 6;
+    s.y[3] = 12;
+    s.y[4] = 13;
+    s.y[5] = 14;
+    s.z[1] = 15;
+    s.z[2] = 16;
+    s.z[3] = 17;
+  }
+  if (err || s.w.x != 6 || s.y[3] != 12 || s.y[4] != 13 || s.y[5] != 14  // values written inside the region
+      || s.z[1] != 15 || s.z[2] != 16 || s.z[3] != 17)
+    abort ();
+  s.u++;
+  s.v.a++;
+  s.v.b.y++;
+  s.w.x++;  // now u == 2, v.b.y == 4, w.x == 7, as checked inside the next region
+  s.x[1] = 18;
+  s.z[0] = 19;
+  #pragma omp target data map (tofrom: s)
+  #pragma omp target map (always to: s.w, s.x[1], err) map (alloc:s.u, s.v.b, s.z[z:z + 1])
+  {
+    err = 0;
+    if (s.u != 2 || s.v.b.y != 4LL || s.w.x != 7 || s.x[1] != 18 || s.z[0] != 19)
+      err = 1;
+    s.w.x = 8;
+    s.x[1] = 20;
+    s.z[0] = 21;
+  }
+  if (err || s.w.x != 8 || s.x[1] != 20 || s.z[0] != 21)
+    abort ();
+  s.u++;
+  s.v.a++;
+  s.v.b.y++;
+  s.w.x++;  // now u == 3, v.b.y == 5, w.x == 9, as checked inside the next region
+  s.x[0] = 22;
+  s.x[1] = 23;
+  #pragma omp target data map (from: s.w, s.x[0:2]) map (to: s.v.b, s.u)
+  #pragma omp target map (always to: s.w, s.x[0:2], err) map (alloc:s.u, s.v.b)
+  {
+    err = 0;
+    if (s.u != 3 || s.v.b.y != 5LL || s.w.x != 9 || s.x[0] != 22 || s.x[1] != 23)
+      err = 1;
+    s.w.x = 11;
+    s.x[0] = 24;
+    s.x[1] = 25;
+  }
+  if (err || s.w.x != 11 || s.x[0] != 24 || s.x[1] != 25)
+    abort ();
+  foo <S> ();  // repeat the same sequence inside a template
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-11.C b/libgomp/testsuite/libgomp.c++/target-11.C
new file mode 100644
index 00000000000..fe99603351d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-11.C
@@ -0,0 +1,121 @@
+extern "C" void abort ();
+struct T { int a; int *b; int c; char (&d)[10]; };  // d is a reference-to-array member
+struct S { int *s; char *u; T v; short *w; short *&x; };  // x is a reference-to-pointer member
+volatile int z;
+
+template <typename A, typename B, typename C, typename D>
+void
+foo ()  // template twin of main's body; main calls foo <char, short, int, S>
+{
+  A d[10];
+  B *e;
+  C a[32], i;
+  A b[32];
+  B c[32];
+  for (i = 0; i < 32; i++)
+    {
+      a[i] = i;
+      b[i] = 32 + i;
+      c[i] = 64 + i;
+    }
+  for (i = 0; i < 10; i++)
+    d[i] = 17 + i;
+  e = c + 18;
+  D s = { a, b + 2, { 0, a + 16, 0, d }, c + 3, e };  // s.s = a, s.u = b + 2, s.v.b = a + 16, s.v.d = d, s.w = c + 3, s.x = e
+  int err = 0;
+  #pragma omp target map (to:s.v.b[0:z + 7], s.template u[z + 1:z + 4]) \
+		     map (tofrom:s.s[3:3], s. template v. template d[z + 1:z + 3]) \
+		     map (from: s.w[z:4], s.x[1:3], err) private (i)
+  {
+    err = 0;
+    for (i = 0; i < 7; i++)
+      if (s.v.b[i] != 16 + i)  // s.v.b[i] is a[16 + i]
+	err = 1;
+    for (i = 1; i < 5; i++)
+      if (s.u[i] != 34 + i)  // s.u[i] is b[2 + i]
+	err = 1;
+    for (i = 3; i < 6; i++)
+      if (s.s[i] != i)
+	err = 1;
+      else
+	s.s[i] = 128 + i;
+    for (i = 1; i < 4; i++)
+      if (s.v.d[i] != 17 + i)
+	err = 1;
+      else
+	s.v.d[i] = 23 + i;
+    for (i = 0; i < 4; i++)
+      s.w[i] = 96 + i;  // s.w[i] is c[3 + i]
+    for (i = 1; i < 4; i++)
+      s.x[i] = 173 + i;  // s.x[i] is c[18 + i]
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < 32; i++)
+    if (a[i] != ((i >= 3 && i < 6) ? 128 + i : i)
+	|| b[i] != 32 + i  // b is only read inside the region
+	|| c[i] != ((i >= 3 && i < 7) ? 93 + i : ((i >= 19 && i < 22) ? 155 + i : 64 + i)))  // 93 + i == 96 + (i - 3), 155 + i == 173 + (i - 18)
+      abort ();
+  for (i = 0; i < 10; i++)
+    if (d[i] != ((i >= 1 && i < 4) ? 23 + i : 17 + i))
+      abort ();
+}
+
+int
+main ()  // non-template version of the checks, followed by the template version
+{
+  char d[10];
+  short *e;
+  int a[32], i;
+  char b[32];
+  short c[32];
+  for (i = 0; i < 32; i++)
+    {
+      a[i] = i;
+      b[i] = 32 + i;
+      c[i] = 64 + i;
+    }
+  for (i = 0; i < 10; i++)
+    d[i] = 17 + i;
+  e = c + 18;
+  S s = { a, b + 2, { 0, a + 16, 0, d }, c + 3, e };  // s.s = a, s.u = b + 2, s.v.b = a + 16, s.v.d = d, s.w = c + 3, s.x = e
+  int err = 0;
+  #pragma omp target map (to:s.v.b[0:z + 7], s.u[z + 1:z + 4]) \
+		     map (tofrom:s.s[3:3], s.v.d[z + 1:z + 3]) \
+		     map (from: s.w[z:4], s.x[1:3], err) private (i)
+  {
+    err = 0;
+    for (i = 0; i < 7; i++)
+      if (s.v.b[i] != 16 + i)  // s.v.b[i] is a[16 + i]
+	err = 1;
+    for (i = 1; i < 5; i++)
+      if (s.u[i] != 34 + i)  // s.u[i] is b[2 + i]
+	err = 1;
+    for (i = 3; i < 6; i++)
+      if (s.s[i] != i)
+	err = 1;
+      else
+	s.s[i] = 128 + i;
+    for (i = 1; i < 4; i++)
+      if (s.v.d[i] != 17 + i)
+	err = 1;
+      else
+	s.v.d[i] = 23 + i;
+    for (i = 0; i < 4; i++)
+      s.w[i] = 96 + i;  // s.w[i] is c[3 + i]
+    for (i = 1; i < 4; i++)
+      s.x[i] = 173 + i;  // s.x[i] is c[18 + i]
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < 32; i++)
+    if (a[i] != ((i >= 3 && i < 6) ? 128 + i : i)
+	|| b[i] != 32 + i  // b is only read inside the region
+	|| c[i] != ((i >= 3 && i < 7) ? 93 + i : ((i >= 19 && i < 22) ? 155 + i : 64 + i)))  // 93 + i == 96 + (i - 3), 155 + i == 173 + (i - 18)
+      abort ();
+  for (i = 0; i < 10; i++)
+    if (d[i] != ((i >= 1 && i < 4) ? 23 + i : 17 + i))
+      abort ();
+  foo <char, short, int, S> ();  // repeat the same sequence inside a template
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-12.C b/libgomp/testsuite/libgomp.c++/target-12.C
new file mode 100644
index 00000000000..3b4ed57df68
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-12.C
@@ -0,0 +1,93 @@
+extern "C" void abort (void);
+struct S { int s; int *u; int v[5]; };  // the tests below point u into a local int array
+volatile int z;
+
+template <typename T>
+void
+foo ()  // template twin of main's body; not called from the main shown in this file
+{
+  int u[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, err = 0;
+  T s = { 9, u + 3, { 10, 11, 12, 13, 14 } };  // s.u[k] is u[k + 3]
+  int *v = u + 4;  // v[k] is u[k + 4], i.e. s.u[k + 1]
+  #pragma omp target enter data map (to: s.s, s.template u[0:5]) map (alloc: s.template v[1:3])
+  s.s++;  // host-side changes made after the enter data: s.s == 10
+  u[3]++;  // s.u[0] == 4
+  s.v[1]++;  // s.v[1] == 12
+  #pragma omp target update to (s.template s) to (s.u[0:2], s.v[1:3])
+  #pragma omp target map (alloc: s.s, s.v[1:3]) map (from: err)
+  {
+    err = 0;
+    if (s.s != 10 || s.v[1] != 12 || s.v[2] != 12 || s.v[3] != 13)  // the values pushed by the update above
+      err = 1;
+    if (v[-1] != 4 || v[0] != 4 || v[1] != 5 || v[2] != 6 || v[3] != 7)  // v[-1] is u[3], the element incremented above
+      err = 1;
+    s.s++;
+    s.v[2] += 2;
+    v[-1] = 5;
+    v[3] = 9;
+  }
+  if (err)
+    abort ();
+  #pragma omp target map (alloc: s.u[0:5])
+  {
+    err = 0;
+    if (s.u[0] != 5 || s.u[1] != 4 || s.u[2] != 5 || s.u[3] != 6 || s.u[4] != 9)  // includes the two writes made through v
+      err = 1;
+    s.u[1] = 12;
+  }
+  #pragma omp target update from (s.s, s.u[0:5]) from (s.v[1:3])
+  if (err || s.s != 11 || u[0] != 0 || u[1] != 1 || u[2] != 2 || u[3] != 5  // err here is the host copy; the region above does not map it explicitly
+      || u[4] != 12 || u[5] != 5 || u[6] != 6 || u[7] != 9 || u[8] != 8  // only u[3..7] are expected to differ from the initializer
+      || u[9] != 9 || s.v[0] != 10 || s.v[1] != 12 || s.v[2] != 14
+      || s.v[3] != 13 || s.v[4] != 14)
+    abort ();
+  #pragma omp target exit data map (release: s.s)
+  #pragma omp target exit data map (release: s.u[0:5])
+  #pragma omp target exit data map (delete: s.v[1:3])
+  #pragma omp target exit data map (release: s.s)
+}
+
+int
+main ()  // enter data / update / exit data sequence on struct members and on a pointer-based section
+{
+  int u[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, err = 0;
+  S s = { 9, u + 3, { 10, 11, 12, 13, 14 } };  // s.u[k] is u[k + 3]
+  int *v = u + 4;  // v[k] is u[k + 4], i.e. s.u[k + 1]
+  #pragma omp target enter data map (to: s.s, s.u[0:5]) map (alloc: s.v[1:3])
+  s.s++;  // host-side changes made after the enter data: s.s == 10
+  u[3]++;  // s.u[0] == 4
+  s.v[1]++;  // s.v[1] == 12
+  #pragma omp target update to (s.s) to (s.u[0:2], s.v[1:3])
+  #pragma omp target map (alloc: s.s, s.v[1:3]) map (from: err)
+  {
+    err = 0;
+    if (s.s != 10 || s.v[1] != 12 || s.v[2] != 12 || s.v[3] != 13)  // the values pushed by the update above
+      err = 1;
+    if (v[-1] != 4 || v[0] != 4 || v[1] != 5 || v[2] != 6 || v[3] != 7)  // v[-1] is u[3], the element incremented above
+      err = 1;
+    s.s++;
+    s.v[2] += 2;
+    v[-1] = 5;
+    v[3] = 9;
+  }
+  if (err)
+    abort ();
+  #pragma omp target map (alloc: s.u[0:5])
+  {
+    err = 0;
+    if (s.u[0] != 5 || s.u[1] != 4 || s.u[2] != 5 || s.u[3] != 6 || s.u[4] != 9)  // includes the two writes made through v
+      err = 1;
+    s.u[1] = 12;
+  }
+  #pragma omp target update from (s.s, s.u[0:5]) from (s.v[1:3])
+  if (err || s.s != 11 || u[0] != 0 || u[1] != 1 || u[2] != 2 || u[3] != 5  // err here is the host copy; the region above does not map it explicitly
+      || u[4] != 12 || u[5] != 5 || u[6] != 6 || u[7] != 9 || u[8] != 8  // only u[3..7] are expected to differ from the initializer
+      || u[9] != 9 || s.v[0] != 10 || s.v[1] != 12 || s.v[2] != 14
+      || s.v[3] != 13 || s.v[4] != 14)
+    abort ();
+  #pragma omp target exit data map (release: s.s)
+  #pragma omp target exit data map (release: s.u[0:5])
+  #pragma omp target exit data map (always, delete: s.v[1:3])
+  #pragma omp target exit data map (release: s.s)
+  #pragma omp target exit data map (always delete : s.v[1:3])
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-2.C b/libgomp/testsuite/libgomp.c++/target-2.C
index 35e910acc2e..1eab7f29b4a 100644
--- a/libgomp/testsuite/libgomp.c++/target-2.C
+++ b/libgomp/testsuite/libgomp.c++/target-2.C
@@ -33,7 +33,8 @@ fn2 (int x, double (&dr) [1024], double *&er)
   int j;
   fn1 (hr + 2 * x, ir + 2 * x, x);
   #pragma omp target map(to: br[:x], cr[0:x], dr[x:x], er[x:x]) \
-		     map(to: fr[0:x], gr[0:x], hr[2 * x:x], ir[2 * x:x])
+		     map(to: fr[0:x], gr[0:x], hr[2 * x:x], ir[2 * x:x]) \
+		     map(tofrom: s)
     #pragma omp parallel for reduction(+:s)
       for (j = 0; j < x; j++)
 	s += br[j] * cr[j] + dr[x + j] + er[x + j]
diff --git a/libgomp/testsuite/libgomp.c++/target-5.C b/libgomp/testsuite/libgomp.c++/target-5.C
new file mode 100644
index 00000000000..6639be394c6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-5.C
@@ -0,0 +1 @@
+#include "../libgomp.c/target-13.c"
diff --git a/libgomp/testsuite/libgomp.c++/target-6.C b/libgomp/testsuite/libgomp.c++/target-6.C
new file mode 100644
index 00000000000..8dbafb0437b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-6.C
@@ -0,0 +1,64 @@
+extern "C" void abort (void);
+struct S { int s, t; };  // Two-int aggregate; foo receives it by reference and names it in private/firstprivate.
+
+void  // Exercises private/firstprivate on a target region for reference parameters and VLA references.
+foo (int &x, int &y, S &u, S &v, double &s, double &t)
+{
+  int err = 0, i;
+  int a[y - 2], b[y - 2];  // Variable-length arrays sized from y.
+  int (&c)[y - 2] = a, (&d)[y - 2] = b;  // References to the VLAs; the clauses below name c and d.
+  for (i = 0; i < y - 2; i++)
+    {
+      c[i] = i;
+      d[i] = 3 + i;
+    }
+  #pragma omp target private (x, u, s, c, i) firstprivate (y, v, t, d) map(from:err)
+  {
+    x = y;  // Private copies are assigned from the firstprivate ones before being read.
+    u = v;
+    s = t;
+    for (i = 0; i < y - 2; i++)
+      c[i] = d[i];
+    err = (x != 6 || y != 6  // Constants match the values main passes in.
+	   || u.s != 9 || u.t != 10 || v.s != 9 || v.t != 10
+	   || s != 12.5 || t != 12.5);
+    for (i = 0; i < y - 2; i++)
+      if (d[i] != 3 + i || c[i] != 3 + i)
+	err = 1;
+      else
+	{
+	  c[i] += 2 * i;
+	  d[i] += i;
+	}
+    x += 1;  // Modify every copy inside the region, then re-check it.
+    y += 2;
+    u.s += 3;
+    v.t += 4;
+    s += 2.5;
+    t += 3.0;
+    if (x != 7 || y != 8
+	|| u.s != 12 || u.t != 10 || v.s != 9 || v.t != 14
+	|| s != 15.0 || t != 15.5)
+      err = 1;
+    for (i = 0; i < y - 4; i++)  // y was increased by 2 above, so y - 4 equals the earlier y - 2.
+      if (d[i] != 3 + 2 * i || c[i] != 3 + 3 * i)
+	err = 1;
+  }
+  if (err || x != 5 || y != 6  // After the region the caller-visible objects must hold their original values.
+      || u.s != 7 || u.t != 8 || v.s != 9 || v.t != 10
+      || s != 11.5 || t != 12.5)
+    abort ();
+  for (i = 0; i < y - 2; i++)
+    if (d[i] != 3 + i || c[i] != i)  // The arrays must also be as initialised before the region.
+      abort ();
+}
+
+int  // Supplies the concrete values that foo's checks are written against.
+main ()
+{
+  int x = 5, y = 6;
+  S u = { 7, 8 }, v = { 9, 10 };
+  double s = 11.5, t = 12.5;
+  foo (x, y, u, v, s, t);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-7.C b/libgomp/testsuite/libgomp.c++/target-7.C
new file mode 100644
index 00000000000..e13c50f26da
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-7.C
@@ -0,0 +1,90 @@
+extern "C" void abort ();
+
+void  // Maps array sections reached via a pointer, a reference to pointer, a reference to array and two locals.
+foo (int *x, int *&y, int (&z)[15])
+{
+  int a[10], b[15], err, i;
+  for (i = 0; i < 10; i++)
+    a[i] = 7 * i;
+  for (i = 0; i < 15; i++)
+    b[i] = 8 * i;
+  #pragma omp target map(to:x[5:10], y[5:10], z[5:10], a[0:10], b[5:10]) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 10; i++)
+      if (x[5 + i] != 20 + 4 * i  // Expected values follow from main's x[i] = 4 * i, y[i] = 5 * i, z[i] = 6 * i.
+	  || y[5 + i] != 25 + 5 * i
+	  || z[5 + i] != 30 + 6 * i
+	  || a[i] != 7 * i
+	  || b[5 + i] != 40 + 8 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+void  // Same checks as foo but on VLAs, first with constant section bounds, then with bounds computed from v.
+bar (int n, int v)
+{
+  int a[n], b[n], c[n], d[n], e[n], err, i;
+  int (*x)[n] = &c;  // Pointer to VLA.
+  int (*y2)[n] = &d;
+  int (*&y)[n] = y2;  // Reference to pointer to VLA.
+  int (&z)[n] = e;  // Reference to VLA.
+  for (i = 0; i < n; i++)
+    {
+      (*x)[i] = 4 * i;
+      (*y)[i] = 5 * i;
+      z[i] = 6 * i;
+      a[i] = 7 * i;
+      b[i] = 8 * i;
+    }
+  #pragma omp target map(to:x[0][5:10], y[0][5:10], z[5:10], a[0:10], b[5:10]) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 10; i++)
+      if ((*x)[5 + i] != 20 + 4 * i
+	  || (*y)[5 + i] != 25 + 5 * i
+	  || z[5 + i] != 30 + 6 * i
+	  || a[i] != 7 * i
+	  || b[5 + i] != 40 + 8 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++)  // Refill with different multipliers before the second region.
+    {
+      (*x)[i] = 9 * i;
+      (*y)[i] = 10 * i;
+      z[i] = 11 * i;
+      a[i] = 12 * i;
+      b[i] = 13 * i;
+    }
+  #pragma omp target map(to:x[0][v:v+5], y[0][v:v+5], z[v:v+5], a[v-5:v+5], b[v:v+5]) map(from:err)
+  {
+    err = 0;  // The body indexes with the constants 5 and 10; main passes v == 5.
+    for (i = 0; i < 10; i++)
+      if ((*x)[5 + i] != 45 + 9 * i
+	  || (*y)[5 + i] != 50 + 10 * i
+	  || z[5 + i] != 55 + 11 * i
+	  || a[i] != 12 * i
+	  || b[5 + i] != 65 + 13 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+int  // Builds the arrays foo checks, then runs bar with n == 15 and v == 5.
+main ()
+{
+  int x[15], y2[15], z[15], *y = y2, i;
+  for (i = 0; i < 15; i++)
+    {
+      x[i] = 4 * i;
+      y[i] = 5 * i;
+      z[i] = 6 * i;
+    }
+  foo (x, y, z);
+  bar (15, 5);
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-8.C b/libgomp/testsuite/libgomp.c++/target-8.C
new file mode 100644
index 00000000000..d886b476754
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-8.C
@@ -0,0 +1,58 @@
+extern "C" void abort ();
+struct S { int a; };  // Single-member aggregate used as a firstprivate operand.
+#ifdef __SIZEOF_INT128__
+typedef __int128 T;  // Use the 128-bit integer type when the compiler defines __SIZEOF_INT128__.
+#else
+typedef long long int T;  // Fallback when __SIZEOF_INT128__ is not defined.
+#endif
+
+void  // firstprivate on by-value parameters: writes inside the target region must not reach a, b, c.
+foo (T a, int b, struct S c)
+{
+  int err;
+  #pragma omp target firstprivate (a, b, c) map(from:err)
+  {
+    err = 0;
+    if (a != 131 || b != 276 || c.a != 59)  // Copies must start with the values main passes.
+      err = 1;
+    a = 936;
+    b = 27;
+    c.a = 98;
+    if (a != 936 || b != 27 || c.a != 98)
+      err = 1;
+  }
+  if (err || a != 131 || b != 276 || c.a != 59)  // Originals must be unchanged after the region.
+    abort ();
+}
+
+void  // Same check as foo, but the firstprivate operands are reference parameters.
+bar (T &a, int &b, struct S &c)
+{
+  int err;
+  #pragma omp target firstprivate (a, b, c) map(from:err)
+  {
+    err = 0;
+    if (a != 131 || b != 276 || c.a != 59)
+      err = 1;
+    a = 936;
+    b = 27;
+    c.a = 98;
+    if (a != 936 || b != 27 || c.a != 98)
+      err = 1;
+  }
+  if (err || a != 131 || b != 276 || c.a != 59)  // The referenced objects must be unchanged.
+    abort ();
+}
+
+int  // Runs foo (by value) and bar (by reference) on the same objects.
+main ()
+{
+  T a = 131;
+  int b = 276;
+  struct S c;
+  c.a = 59;
+  foo (a, b, c);
+  bar (a, b, c);
+  if (a != 131 || b != 276 || c.a != 59)  // Re-check in the caller after both calls.
+    abort ();
+}
diff --git a/libgomp/testsuite/libgomp.c++/target-9.C b/libgomp/testsuite/libgomp.c++/target-9.C
new file mode 100644
index 00000000000..a5d171b0b3d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/target-9.C
@@ -0,0 +1,73 @@
+extern "C" void abort (void);
+
+void  // For five kinds of operand, nests target data map / use_device_ptr / target is_device_ptr and checks contents.
+foo (int *&p, int (&s)[5], int n)
+{
+  int a[4] = { 7, 8, 9, 10 }, b[n], c[3] = { 20, 21, 22 };
+  int *r = a + 1, *q = p - 1, i, err;  // r and p each point one element past the start of their array.
+  for (i = 0; i < n; i++)
+    b[i] = 9 + i;
+  #pragma omp target data map(to:a)
+  #pragma omp target data use_device_ptr(r) map(from:err)
+  #pragma omp target is_device_ptr(r) private(i) map(from:err)
+  {
+    err = 0;  // Case 1: plain pointer r into the mapped local array a.
+    for (i = 0; i < 4; i++)
+      if (r[i - 1] != 7 + i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  #pragma omp target data map(to:q[:4])
+  #pragma omp target data use_device_ptr(p) map(from:err)
+  #pragma omp target is_device_ptr(p) private(i) map(from:err)
+  {
+    err = 0;  // Case 2: reference-to-pointer p; the data is mapped through q == p - 1.
+    for (i = 0; i < 4; i++)
+      if (p[i - 1] != i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  #pragma omp target data map(to:b)
+  #pragma omp target data use_device_ptr(b) map(from:err)
+  #pragma omp target is_device_ptr(b) private(i) map(from:err)
+  {
+    err = 0;  // Case 3: variable-length array b.
+    for (i = 0; i < n; i++)
+      if (b[i] != 9 + i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  #pragma omp target data map(to:c)
+  #pragma omp target data use_device_ptr(c) map(from:err)
+  #pragma omp target is_device_ptr(c) private(i) map(from:err)
+  {
+    err = 0;  // Case 4: fixed-size local array c.
+    for (i = 0; i < 3; i++)
+      if (c[i] != 20 + i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  #pragma omp target data map(to:s[:5])
+  #pragma omp target data use_device_ptr(s) map(from:err)
+  #pragma omp target is_device_ptr(s) private(i) map(from:err)
+  {
+    err = 0;  // Case 5: reference-to-array parameter s.
+    for (i = 0; i < 5; i++)
+      if (s[i] != 17 + i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+int  // Provides the arrays whose contents foo's p and s checks expect; n is 9.
+main ()
+{
+  int a[4] = { 0, 1, 2, 3 }, b[5] = { 17, 18, 19, 20, 21 };
+  int *p = a + 1;
+  foo (p, b, 9);
+}
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-1.C b/libgomp/testsuite/libgomp.c++/taskloop-1.C
new file mode 100644
index 00000000000..66f8e0b1d7c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-1.C
@@ -0,0 +1,4 @@
+// { dg-do run }
+// { dg-options "-O2 -fopenmp" }
+
+#include "../libgomp.c/taskloop-1.c"
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-2.C b/libgomp/testsuite/libgomp.c++/taskloop-2.C
new file mode 100644
index 00000000000..67a0e92717e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-2.C
@@ -0,0 +1,6 @@
+// { dg-do run }
+// { dg-options "-O2" }
+// { dg-additional-options "-msse2" { target sse2_runtime } }
+// { dg-additional-options "-mavx" { target avx_runtime } }
+
+#include "../libgomp.c/taskloop-2.c"
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-3.C b/libgomp/testsuite/libgomp.c++/taskloop-3.C
new file mode 100644
index 00000000000..bfd793c1c58
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-3.C
@@ -0,0 +1,4 @@
+// { dg-do run }
+// { dg-options "-O2 -fopenmp" }
+
+#include "../libgomp.c/taskloop-3.c"
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-4.C b/libgomp/testsuite/libgomp.c++/taskloop-4.C
new file mode 100644
index 00000000000..937cfcc0029
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-4.C
@@ -0,0 +1,4 @@
+// { dg-do run }
+// { dg-options "-O2 -fopenmp" }
+
+#include "../libgomp.c/taskloop-4.c"
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-5.C b/libgomp/testsuite/libgomp.c++/taskloop-5.C
new file mode 100644
index 00000000000..eb464467b66
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-5.C
@@ -0,0 +1,73 @@
+#include <omp.h>
+
+__attribute__((noinline, noclone)) void  // Three taskloops on reference b: firstprivate, lastprivate, then both.
+foo (int &b)
+{
+#pragma omp parallel
+#pragma omp single
+  {
+    bool f = false;  // firstprivate flag: false until this copy has been written once.
+  #pragma omp taskloop firstprivate (b, f)
+    for (int i = 0; i < 30; i++)
+      {
+	int q = omp_get_thread_num ();
+	if (!f)
+	  {
+	    if (b != 2)  // An unwritten copy must still hold the value main passed.
+	      __builtin_abort ();
+	  }
+	else if (b != 8 * q)  // Otherwise it must hold what an earlier iteration stored.
+	  __builtin_abort ();
+	b = 8 * q;
+	f = true;
+      }
+  }
+  int n;
+#pragma omp parallel
+#pragma omp single
+  {
+    bool f = false;
+  #pragma omp taskloop firstprivate (f) lastprivate (b, n)
+    for (int i = 0; i < 30; i++)
+      {
+	int q = omp_get_thread_num ();
+	if (f && b != 8 * q)
+	  __builtin_abort ();
+	b = 8 * q;
+	n = q;
+	f = true;
+      }
+  }
+  if (b != 8 * n)  // b and n are both lastprivate, so they must be mutually consistent afterwards.
+    __builtin_abort ();
+  b = 9;
+#pragma omp parallel
+#pragma omp single
+  {
+    bool f = false;
+  #pragma omp taskloop firstprivate (b, f) lastprivate (b, n)
+    for (int i = 0; i < 30; i++)
+      {
+	int q = omp_get_thread_num ();
+	if (!f)
+	  {
+	    if (b != 9)  // b is both firstprivate and lastprivate; the initial value is the 9 set above.
+	      __builtin_abort ();
+	  }
+	else if (b != 11 * q)
+	  __builtin_abort ();
+	b = 11 * q;
+	n = q;
+	f = true;
+      }
+  }
+  if (b != 11 * n)
+    __builtin_abort ();
+}
+
+int  // Passes b == 2, the value foo's first taskloop expects.
+main ()
+{
+  int b = 2;
+  foo (b);
+}
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-6.C b/libgomp/testsuite/libgomp.c++/taskloop-6.C
new file mode 100644
index 00000000000..edf7f7a371b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-6.C
@@ -0,0 +1,442 @@
+// { dg-do run }
+
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+extern "C" void abort ();
+
+template <typename T>  // Pointer-wrapping iterator class used as the taskloop loop variable in this test.
+class I
+{
+public:
+  typedef ptrdiff_t difference_type;
+  I ();
+  ~I ();
+  I (T *);  // Non-explicit, so main can pass &a[k] where an I<int> is expected.
+  I (const I &);
+  T &operator * ();
+  T *operator -> ();
+  T &operator [] (const difference_type &) const;
+  I &operator = (const I &);
+  I &operator ++ ();
+  I operator ++ (int);
+  I &operator -- ();
+  I operator -- (int);
+  I &operator += (const difference_type &);
+  I &operator -= (const difference_type &);
+  I operator + (const difference_type &) const;
+  I operator - (const difference_type &) const;
+  template <typename S> friend bool operator == (I<S> &, I<S> &);  // Each comparison is declared for non-const and const references.
+  template <typename S> friend bool operator == (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator < (I<S> &, I<S> &);
+  template <typename S> friend bool operator < (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator <= (I<S> &, I<S> &);
+  template <typename S> friend bool operator <= (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator > (I<S> &, I<S> &);
+  template <typename S> friend bool operator > (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator >= (I<S> &, I<S> &);
+  template <typename S> friend bool operator >= (const I<S> &, const I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (I<S> &, I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (const I<S> &, const I<S> &);
+  template <typename S> friend I<S> operator + (typename I<S>::difference_type , const I<S> &);  // Allows the "n + i" spelling.
+private:
+  T *p;  // The wrapped element pointer; all operations forward to it.
+};
+template <typename T> I<T>::I () : p (0) {}  // Default-constructed iterator wraps a null pointer.
+template <typename T> I<T>::~I () {}
+template <typename T> I<T>::I (T *x) : p (x) {}
+template <typename T> I<T>::I (const I &x) : p (x.p) {}
+template <typename T> T &I<T>::operator * () { return *p; }
+template <typename T> T *I<T>::operator -> () { return p; }
+template <typename T> T &I<T>::operator [] (const difference_type &x) const { return p[x]; }
+template <typename T> I<T> &I<T>::operator = (const I &x) { p = x.p; return *this; }
+template <typename T> I<T> &I<T>::operator ++ () { ++p; return *this; }
+template <typename T> I<T> I<T>::operator ++ (int) { return I (p++); }  // Post-forms return an iterator built from the old pointer.
+template <typename T> I<T> &I<T>::operator -- () { --p; return *this; }
+template <typename T> I<T> I<T>::operator -- (int) { return I (p--); }
+template <typename T> I<T> &I<T>::operator += (const difference_type &x) { p += x; return *this; }
+template <typename T> I<T> &I<T>::operator -= (const difference_type &x) { p -= x; return *this; }
+template <typename T> I<T> I<T>::operator + (const difference_type &x) const { return I (p + x); }
+template <typename T> I<T> I<T>::operator - (const difference_type &x) const { return I (p - x); }
+template <typename T> bool operator == (I<T> &x, I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator == (const I<T> &x, const I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator != (I<T> &x, I<T> &y) { return !(x == y); }  // Defined via ==, so it does not touch p directly.
+template <typename T> bool operator != (const I<T> &x, const I<T> &y) { return !(x == y); }
+template <typename T> bool operator < (I<T> &x, I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator < (const I<T> &x, const I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator <= (I<T> &x, I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator <= (const I<T> &x, const I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator > (I<T> &x, I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator > (const I<T> &x, const I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator >= (I<T> &x, I<T> &y) { return x.p >= y.p; }
+template <typename T> bool operator >= (const I<T> &x, const I<T> &y) { return x.p >= y.p; }
+template <typename T> typename I<T>::difference_type operator - (I<T> &x, I<T> &y) { return x.p - y.p; }
+template <typename T> typename I<T>::difference_type operator - (const I<T> &x, const I<T> &y) { return x.p - y.p; }
+template <typename T> I<T> operator + (typename I<T>::difference_type x, const I<T> &y) { return I<T> (x + y.p); }
+
+template <typename T>  // Holds a begin/end pair of I<T> and hands them out by const reference.
+class J
+{
+public:
+  J(const I<T> &x, const I<T> &y) : b (x), e (y) {}
+  const I<T> &begin ();
+  const I<T> &end ();
+private:
+  I<T> b, e;
+};
+
+template <typename T> const I<T> &J<T>::begin () { return b; }
+template <typename T> const I<T> &J<T>::end () { return e; }
+
+int results[2000];
+
+template <typename T>  // Loop body shared by every f*: counts one visit for the value the iterator points at.
+void
+baz (I<T> &i)
+{
+  if (*i < 0 || *i >= 2000)  // Guard the results[] index.
+    abort ();
+  results[*i]++;
+}
+
+void  // taskloop num_tasks(22); loop-local iterator, inclusive bound, step written as += 6.
+f1 (const I<int> &x, const I<int> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop num_tasks(22)
+  for (I<int> i = x; i <= y; i += 6)
+    baz (i);
+}
+
+void  // taskloop grainsize(384) private(i); the step is spelled i = 1 - 6 + 7 + i, a net step of 2.
+f2 (const I<int> &x, const I<int> &y)
+{
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop grainsize(384) private(i)
+  for (i = x; i < y - 1; i = 1 - 6 + 7 + i)
+    baz (i);
+}
+
+template <typename T>  // T is unused in the body; taskloop default(none) with the bounds firstprivate, net step of 1.
+void
+f3 (const I<int> &x, const I<int> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop default(none) firstprivate (x, y)
+  for (I<int> i = x; i <= y; i = i + 9 - 8)
+    baz (i);
+}
+
+template <typename T>  // T is unused in the body; downward loop using --i with lastprivate(i).
+void
+f4 (const I<int> &x, const I<int> &y)
+{
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x + 2000 - 64; i > y + 10; --i)
+    baz (i);
+}
+
+void  // Downward loop with a loop-local iterator and step written as -= 10.
+f5 (const I<int> &x, const I<int> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (I<int> i = x + 2000 - 64; i > y + 10; i -= 10)
+    baz (i);
+}
+
+template <int N>  // Downward loop, step spelled i = i - 12 + 2; baz is called on i offset by N.
+void
+f6 (const I<int> &x, const I<int> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (I<int> i = x + 2000 - 64; i > y + 10; i = i - 12 + 2)
+    {
+      I<int> j = i + N;  // Copy, so baz gets an lvalue and the loop variable is not modified.
+      baz (j);
+    }
+}
+
+template <int N>  // Loop variable is the by-value parameter i; default(none) with only x and y listed.
+void
+f7 (I<int> i, const I<int> &x, const I<int> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop default(none) firstprivate (x, y)
+  for (i = x - 10; i <= y + 10; i += N)
+    baz (i);
+}
+
+template <int N>  // Bounds come from J's accessors; num_tasks dereferences a temporary copy of j.begin ().
+void
+f8 (J<int> j)
+{
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop default(none) num_tasks(*I<int> (j.begin ())) firstprivate (j)
+  for (i = j.begin (); i <= j.end () + N; i += 2)
+    baz (i);
+}
+
+template <typename T, int N>  // Element type and step are template parameters; taskloop grainsize(163).
+void
+f9 (const I<T> &x, const I<T> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop grainsize(163)
+  for (I<T> i = x; i <= y; i = i + N)
+    baz (i);
+}
+
+template <typename T, int N>  // Condition is i > y with step i + N; main instantiates it with a negative N.
+void
+f10 (const I<T> &x, const I<T> &y)
+{
+  I<T> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x; i > y; i = i + N)
+    baz (i);
+}
+
+template <typename T>  // Two single nowait constructs in one parallel: a taskloop nogroup, then one extra baz call.
+void
+f11 (const T &x, const T &y)
+{
+#pragma omp parallel
+  {
+#pragma omp single nowait
+#pragma omp taskloop nogroup
+    for (T i = x; i <= y; i += 3)
+      baz (i);
+#pragma omp single nowait
+    {
+      T j = y + 3;  // One element beyond the loop's bound; main's check allows for it.
+      baz (j);
+    }
+  }
+}
+
+template <typename T>  // Iterator type is the template parameter; downward loop using --i.
+void
+f12 (const T &x, const T &y)
+{
+  T i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x; i > y; --i)
+    baz (i);
+}
+
+template <int N>  // taskloop inside a member template of a class template; N is both the step and a bound offset.
+struct K
+{
+  template <typename T>
+  static void
+  f13 (const T &x, const T &y)
+  {
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+    for (T i = x; i <= y + N; i += N)
+      baz (i);
+  }
+};
+
+I<int>  // Same loop as f2 but with lastprivate(i); returns i so main can check its final value.
+f14 (const I<int> &x, const I<int> &y)
+{
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x; i < y - 1; i = 1 - 6 + 7 + i)
+    baz (i);
+  return i;
+}
+
+template <typename T>  // Same loop as f4; returns the lastprivate iterator.
+I<int>
+f15 (const I<int> &x, const I<int> &y)
+{
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x + 2000 - 64; i > y + 10; --i)
+    baz (i);
+  return i;
+}
+
+template <int N>  // Same loop as f7 but the parameter i is lastprivate and returned.
+I<int>
+f16 (I<int> i, const I<int> &x, const I<int> &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x - 10; i <= y + 10; i += N)
+    baz (i);
+  return i;
+}
+
+template <int N>  // Same loop as f8, but the lastprivate loop variable is a function-local static.
+I<int>
+f17 (J<int> j)
+{
+  static I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = j.begin (); i <= j.end () + N; i += 2)
+    baz (i);
+  return i;
+}
+
+template <typename T, int N>  // Same loop as f10, with a static lastprivate loop variable that is returned.
+I<T>
+f18 (const I<T> &x, const I<T> &y)
+{
+  static I<T> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x; i > y; i = i + N)
+    baz (i);
+  return i;
+}
+
+template <typename T>  // Same shape as f11, adding lastprivate(i) to the nogroup taskloop; returns i.
+T
+f19 (const T &x, const T &y)
+{
+  T i;
+#pragma omp parallel
+  {
+#pragma omp single nowait
+#pragma omp taskloop nogroup lastprivate(i)
+    for (i = x; i <= y; i += 3)
+      baz (i);
+#pragma omp single nowait
+    {
+      T j = y + 3;
+      baz (j);
+    }
+  }
+  return i;  // Read after the parallel region has ended.
+}
+
+template <typename T>  // Same loop as f12 with lastprivate(i); returns i.
+T
+f20 (const T &x, const T &y)
+{
+  T i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x; i > y; --i)
+    baz (i);
+  return i;
+}
+
+#define check(expr) /* results[i] must be 1 where expr holds (then reset to 0) and 0 elsewhere; expr may use i. */ \
+  for (int i = 0; i < 2000; i++)			\
+    if (expr)						\
+      {							\
+	if (results[i] != 1)				\
+	  abort ();					\
+	results[i] = 0;					\
+      }							\
+    else if (results[i])				\
+      abort ()
+
+int  // Runs every f* once; each check() states the exact set of indices baz must have visited exactly once.
+main ()
+{
+  int a[2000];
+  long b[2000];
+  for (int i = 0; i < 2000; i++)
+    {
+      a[i] = i;  // Element value equals its index, so *iterator doubles as the results[] slot.
+      b[i] = i;
+    }
+  f1 (&a[10], &a[1990]);
+  check (i >= 10 && i <= 1990 && (i - 10) % 6 == 0);
+  f2 (&a[0], &a[1999]);
+  check (i < 1998 && (i & 1) == 0);
+  f3<char> (&a[20], &a[1837]);
+  check (i >= 20 && i <= 1837);
+  f4<int> (&a[0], &a[30]);
+  check (i > 40 && i <= 2000 - 64);
+  f5 (&a[0], &a[100]);
+  check (i >= 116 && i <= 2000 - 64 && (i - 116) % 10 == 0);
+  f6<-10> (&a[10], &a[110]);  // baz sees i - 10 here, so the visited set is the same as f5's.
+  check (i >= 116 && i <= 2000 - 64 && (i - 116) % 10 == 0);
+  f7<6> (I<int> (), &a[12], &a[1800]);
+  check (i >= 2 && i <= 1808 && (i - 2) % 6 == 0);
+  f8<121> (J<int> (&a[14], &a[1803]));
+  check (i >= 14 && i <= 1924 && (i & 1) == 0);
+  f9<int, 7> (&a[33], &a[1967]);
+  check (i >= 33 && i <= 1967 && (i - 33) % 7 == 0);
+  f10<int, -7> (&a[1939], &a[17]);
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  f11<I<int> > (&a[16], &a[1981]);
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);  // Upper bound 1984 covers f11's extra y + 3 call.
+  f12<I<int> > (&a[1761], &a[37]);
+  check (i > 37 && i <= 1761);
+  K<5>::f13<I<int> > (&a[1], &a[1935]);
+  check (i >= 1 && i <= 1936 && (i - 1) % 5 == 0);
+  if (f14 (&a[0], &a[1999]) != I<int>(&a[1998]))  // From here on the returned lastprivate iterator is checked too.
+    abort ();
+  check (i < 1998 && (i & 1) == 0);
+  if (f15<int> (&a[0], &a[30]) != I<int>(&a[40]))
+    abort ();
+  check (i > 40 && i <= 2000 - 64);
+  if (f16<6> (I<int> (), &a[12], &a[1800]) != I<int>(&a[1814]))
+    abort ();
+  check (i >= 2 && i <= 1808 && (i - 2) % 6 == 0);
+  if (f17<121> (J<int> (&a[14], &a[1803])) != I<int>(&a[1926]))
+    abort ();
+  check (i >= 14 && i <= 1924 && (i & 1) == 0);
+  if (f18<int, -7> (&a[1939], &a[17]) != I<int>(&a[14]))
+    abort ();
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  if (f19<I<int> > (&a[16], &a[1981]) != I<int>(&a[1984]))
+    abort ();
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);
+  if (f20<I<int> > (&a[1761], &a[37]) != I<int>(&a[37]))
+    abort ();
+  check (i > 37 && i <= 1761);
+  f9<long, 7> (&b[33], &b[1967]);  // Repeat the type-generic functions with long elements.
+  check (i >= 33 && i <= 1967 && (i - 33) % 7 == 0);
+  f10<long, -7> (&b[1939], &b[17]);
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  f11<I<long> > (&b[16], &b[1981]);
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);
+  f12<I<long> > (&b[1761], &b[37]);
+  check (i > 37 && i <= 1761);
+  K<5>::f13<I<long> > (&b[1], &b[1935]);
+  check (i >= 1 && i <= 1936 && (i - 1) % 5 == 0);
+  if (f18<long, -7> (&b[1939], &b[17]) != I<long>(&b[14]))
+    abort ();
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  if (f19<I<long> > (&b[16], &b[1981]) != I<long>(&b[1984]))
+    abort ();
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);
+  if (f20<I<long> > (&b[1761], &b[37]) != I<long>(&b[37]))
+    abort ();
+  check (i > 37 && i <= 1761);
+}
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-7.C b/libgomp/testsuite/libgomp.c++/taskloop-7.C
new file mode 100644
index 00000000000..b9a3c81e381
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-7.C
@@ -0,0 +1,400 @@
+// { dg-do run }
+
+#include <vector>
+#include <cstdlib>
+
+template <typename T>  // Holds a begin/end pair of std::vector<T>::const_iterator, returned by const reference.
+class J
+{
+public:
+  typedef typename std::vector<T>::const_iterator const_iterator;
+  J(const const_iterator &x, const const_iterator &y) : b (x), e (y) {}
+  const const_iterator &begin ();
+  const const_iterator &end ();
+private:
+  const_iterator b, e;
+};
+
+template <typename T>
+const typename std::vector<T>::const_iterator &J<T>::begin () { return b; }
+template <typename T>
+const typename std::vector<T>::const_iterator &J<T>::end () { return e; }
+
+int results[2000];
+
+template <typename T>  // Loop body shared by every f*: counts one visit for the value the iterator points at.
+void
+baz (T &i)
+{
+  if (*i < 0 || *i >= 2000)  // Guard the results[] index.
+    std::abort ();
+  results[*i]++;
+}
+
+void  // taskloop over a loop-local vector const_iterator, inclusive bound, step written as += 6.
+f1 (const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::vector<int>::const_iterator i = x; i <= y; i += 6)
+    baz (i);
+}
+
+void  // taskloop private(i); the step is spelled i = 1 - 6 + 7 + i, a net step of 2.
+f2 (const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+  std::vector<int>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop private(i)
+  for (i = x; i < y - 1; i = 1 - 6 + 7 + i)
+    baz (i);
+}
+
+template <typename T>  // T is unused in the body; step spelled i = i + 9 - 8, a net step of 1.
+void
+f3 (const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::vector<int>::const_iterator i = x; i <= y; i = i + 9 - 8)
+    baz (i);
+}
+
+template <typename T>  // T is unused in the body; downward loop using --i with lastprivate(i).
+void
+f4 (const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+  std::vector<int>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x + 2000 - 64; i > y + 10; --i)
+    baz (i);
+}
+
+void  // Downward loop with a loop-local iterator and step written as -= 10.
+f5 (const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::vector<int>::const_iterator i = x + 2000 - 64; i > y + 10; i -= 10)
+    baz (i);
+}
+
+template <int N>  // Downward loop, step spelled i = i - 12 + 2; baz is called on i offset by N.
+void
+f6 (const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::vector<int>::const_iterator i = x + 2000 - 64;
+       i > y + 10; i = i - 12 + 2)
+    {
+      std::vector<int>::const_iterator j = i + N;  // Copy, so baz gets an lvalue and i is not modified.
+      baz (j);
+    }
+}
+
+template <int N>  // Loop variable is the by-value parameter i; step is the template argument.
+void
+f7 (std::vector<int>::const_iterator i,
+    const std::vector<int>::const_iterator &x,
+    const std::vector<int>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x - 10; i <= y + 10; i += N)
+    baz (i);
+}
+
+template <int N>  // Loop bounds come from J's begin ()/end () accessors.
+void
+f8 (J<int> j)
+{
+  std::vector<int>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = j.begin (); i <= j.end () + N; i += 2)
+    baz (i);
+}
+
+template <typename T, int N>  // Element type and step are template parameters; dependent iterator type.
+void
+f9 (const typename std::vector<T>::const_iterator &x,
+    const typename std::vector<T>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (typename std::vector<T>::const_iterator i = x; i <= y; i = i + N)
+    baz (i);
+}
+
+template <typename T, int N>  // Condition is i > y with step i + N; main instantiates it with a negative N.
+void
+f10 (const typename std::vector<T>::const_iterator &x,
+     const typename std::vector<T>::const_iterator &y)
+{
+  typename std::vector<T>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x; i > y; i = i + N)
+    baz (i);
+}
+
+template <typename T>  // Two single nowait constructs in one parallel: a taskloop nogroup, then one extra baz call.
+void
+f11 (const T &x, const T &y)
+{
+#pragma omp parallel
+  {
+#pragma omp single nowait
+#pragma omp taskloop nogroup
+    for (T i = x; i <= y; i += 3)
+      baz (i);
+#pragma omp single nowait
+    {
+      T j = y + 3;  // One element beyond the loop's bound; main's check allows for it.
+      baz (j);
+    }
+  }
+}
+
+template <typename T>  // Iterator type is the template parameter; downward loop using --i.
+void
+f12 (const T &x, const T &y)
+{
+  T i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x; i > y; --i)
+    baz (i);
+}
+
+template <int N>  // taskloop inside a member template of a class template; N is both the step and a bound offset.
+struct K
+{
+  template <typename T>
+  static void
+  f13 (const T &x, const T &y)
+  {
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+    for (T i = x; i <= y + N; i += N)
+      baz (i);
+  }
+};
+
+std::vector<int>::const_iterator  // Same loop as f2 but with lastprivate(i); returns i for main to check.
+f14 (const std::vector<int>::const_iterator &x,
+     const std::vector<int>::const_iterator &y)
+{
+  std::vector<int>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x; i < y - 1; i = 1 - 6 + 7 + i)
+    baz (i);
+  return i;
+}
+
+template <typename T>  // Same loop as f4; returns the lastprivate iterator.
+std::vector<int>::const_iterator
+f15 (const std::vector<int>::const_iterator &x,
+     const std::vector<int>::const_iterator &y)
+{
+  std::vector<int>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x + 2000 - 64; i > y + 10; --i)
+    baz (i);
+  return i;
+}
+
+template <int N>  // Same loop as f7 but the parameter i is lastprivate and returned.
+std::vector<int>::const_iterator
+f16 (std::vector<int>::const_iterator i,
+     const std::vector<int>::const_iterator &x,
+     const std::vector<int>::const_iterator &y)
+{
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x - 10; i <= y + 10; i += N)
+    baz (i);
+  return i;
+}
+
+template <int N>  // Same loop as f8, but the lastprivate loop variable is a function-local static.
+std::vector<int>::const_iterator
+f17 (J<int> j)
+{
+  static std::vector<int>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = j.begin (); i <= j.end () + N; i += 2)
+    baz (i);
+  return i;
+}
+
+template <typename T, int N>  // Same loop as f10, with a static lastprivate loop variable that is returned.
+typename std::vector<T>::const_iterator
+f18 (const typename std::vector<T>::const_iterator &x,
+     const typename std::vector<T>::const_iterator &y)
+{
+  static typename std::vector<T>::const_iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x; i > y; i = i + N)
+    baz (i);
+  return i;
+}
+
+template <typename T>  // Same shape as f11, adding lastprivate(i) to the nogroup taskloop; returns i.
+T
+f19 (const T &x, const T &y)
+{
+  T i;
+#pragma omp parallel
+  {
+#pragma omp single nowait
+#pragma omp taskloop nogroup lastprivate(i)
+    for (i = x; i <= y; i += 3)
+      baz (i);
+#pragma omp single nowait
+    {
+      T j = y + 3;
+      baz (j);
+    }
+  }
+  return i;  // Read after the parallel region has ended.
+}
+
+template <typename T>  // Same loop as f12 with lastprivate(i); returns i.
+T
+f20 (const T &x, const T &y)
+{
+  T i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x; i > y; --i)
+    baz (i);
+  return i;
+}
+
+#define check(expr) /* results[i] must be 1 where expr holds (then reset to 0) and 0 elsewhere; expr may use i. */ \
+  for (int i = 0; i < 2000; i++)			\
+    if (expr)						\
+      {							\
+	if (results[i] != 1)				\
+	  std::abort ();				\
+	results[i] = 0;					\
+      }							\
+    else if (results[i])				\
+      std::abort ()
+
+int  // Runs every f* once; each check() states the exact set of indices baz must have visited exactly once.
+main ()
+{
+  std::vector<int> a(2000);
+  std::vector<long> b(2000);
+  for (int i = 0; i < 2000; i++)
+    {
+      a[i] = i;  // Element value equals its index, so *iterator doubles as the results[] slot.
+      b[i] = i;
+    }
+  f1 (a.begin () + 10, a.begin () + 1990);
+  check (i >= 10 && i <= 1990 && (i - 10) % 6 == 0);
+  f2 (a.begin () + 0, a.begin () + 1999);
+  check (i < 1998 && (i & 1) == 0);
+  f3<char> (a.begin () + 20, a.begin () + 1837);
+  check (i >= 20 && i <= 1837);
+  f4<int> (a.begin () + 0, a.begin () + 30);
+  check (i > 40 && i <= 2000 - 64);
+  f5 (a.begin () + 0, a.begin () + 100);
+  check (i >= 116 && i <= 2000 - 64 && (i - 116) % 10 == 0);
+  f6<-10> (a.begin () + 10, a.begin () + 110);  // baz sees i - 10 here, so the visited set is the same as f5's.
+  check (i >= 116 && i <= 2000 - 64 && (i - 116) % 10 == 0);
+  f7<6> (std::vector<int>::const_iterator (), a.begin () + 12,
+	 a.begin () + 1800);
+  check (i >= 2 && i <= 1808 && (i - 2) % 6 == 0);
+  f8<121> (J<int> (a.begin () + 14, a.begin () + 1803));
+  check (i >= 14 && i <= 1924 && (i & 1) == 0);
+  f9<int, 7> (a.begin () + 33, a.begin () + 1967);
+  check (i >= 33 && i <= 1967 && (i - 33) % 7 == 0);
+  f10<int, -7> (a.begin () + 1939, a.begin () + 17);
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  f11<std::vector<int>::const_iterator > (a.begin () + 16, a.begin () + 1981);
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);  // Upper bound 1984 covers f11's extra y + 3 call.
+  f12<std::vector<int>::const_iterator > (a.begin () + 1761, a.begin () + 37);
+  check (i > 37 && i <= 1761);
+  K<5>::f13<std::vector<int>::const_iterator > (a.begin () + 1,
+						a.begin () + 1935);
+  check (i >= 1 && i <= 1936 && (i - 1) % 5 == 0);
+  if (f14 (a.begin () + 0, a.begin () + 1999) != a.begin () + 1998)  // From here on the returned lastprivate iterator is checked too.
+    std::abort ();
+  check (i < 1998 && (i & 1) == 0);
+  if (f15<int> (a.begin () + 0, a.begin () + 30) != a.begin () + 40)
+    std::abort ();
+  check (i > 40 && i <= 2000 - 64);
+  if (f16<6> (std::vector<int>::const_iterator (), a.begin () + 12,
+	      a.begin () + 1800) != a.begin () + 1814)
+    std::abort ();
+  check (i >= 2 && i <= 1808 && (i - 2) % 6 == 0);
+  if (f17<121> (J<int> (a.begin () + 14, a.begin () + 1803)) != a.begin () + 1926)
+    std::abort ();
+  check (i >= 14 && i <= 1924 && (i & 1) == 0);
+  if (f18<int, -7> (a.begin () + 1939, a.begin () + 17) != a.begin () + 14)
+    std::abort ();
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  if (f19<std::vector<int>::const_iterator > (a.begin () + 16, a.begin () + 1981)
+      != a.begin () + 1984)
+    std::abort ();
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);
+  if (f20<std::vector<int>::const_iterator > (a.begin () + 1761, a.begin () + 37)
+      != a.begin () + 37)
+    std::abort ();
+  check (i > 37 && i <= 1761);
+  f9<long, 7> (b.begin () + 33, b.begin () + 1967);  // Repeat the type-generic functions with long elements.
+  check (i >= 33 && i <= 1967 && (i - 33) % 7 == 0);
+  f10<long, -7> (b.begin () + 1939, b.begin () + 17);
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  f11<std::vector<long>::const_iterator > (b.begin () + 16, b.begin () + 1981);
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);
+  f12<std::vector<long>::const_iterator > (b.begin () + 1761, b.begin () + 37);
+  check (i > 37 && i <= 1761);
+  K<5>::f13<std::vector<long>::const_iterator > (b.begin () + 1,
+						 b.begin () + 1935);
+  check (i >= 1 && i <= 1936 && (i - 1) % 5 == 0);
+  if (f18<long, -7> (b.begin () + 1939, b.begin () + 17) != b.begin () + 14)
+    std::abort ();
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  if (f19<std::vector<long>::const_iterator > (b.begin () + 16, b.begin () + 1981)
+      != b.begin () + 1984)
+    std::abort ();
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);
+  if (f20<std::vector<long>::const_iterator > (b.begin () + 1761, b.begin () + 37)
+      != b.begin () + 37)
+    std::abort ();
+  check (i > 37 && i <= 1761);
+}
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-8.C b/libgomp/testsuite/libgomp.c++/taskloop-8.C
new file mode 100644
index 00000000000..d164907d1d6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-8.C
@@ -0,0 +1,250 @@
+// { dg-do run }
+
+#include <string>
+#include <cstdlib>
+
+template <typename T>
+class J  // Stores two std::basic_string<T> iterators and hands them back by const reference.
+{
+public:
+  typedef typename std::basic_string<T>::iterator iterator;
+  J(const iterator &x, const iterator &y) : b (x), e (y) {}
+  const iterator &begin ();  // Reference to the first stored iterator.
+  const iterator &end ();  // Reference to the second stored iterator.
+private:
+  iterator b, e;  // Copies of the constructor arguments x and y.
+};
+
+template <typename T>
+const typename std::basic_string<T>::iterator &J<T>::begin () { return b; }
+template <typename T>
+const typename std::basic_string<T>::iterator &J<T>::end () { return e; }
+
+template <typename T>
+void
+baz (T &i)  // Loop body shared by all tests: validate the element I refers to, then bump it.
+{
+  if (*i < L'a' || *i >= L'a' + 2000)  // Outside the range of values main () stores.
+    std::abort ();
+  (*i)++;  // Marks the element as visited; the check macro verifies and undoes this.
+}
+
+void
+f1 (const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Taskloop with the iterator declared in the for-init.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::basic_string<wchar_t>::iterator i = x; i <= y; i += 6)  // x, x + 6, ... while <= y.
+    baz (i);
+}
+
+void
+f2 (const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Taskloop over a pre-declared iterator listed in private().
+  std::basic_string<wchar_t>::iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop private(i)
+  for (i = x; i < y - 1; i = 1 - 6 + 7 + i)  // Increment spelled as a constant expression; net step is +2.
+    baz (i);
+}
+
+template <typename T>
+void
+f3 (const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Same shape as f1 but inside a template; T is not used in the body.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::basic_string<wchar_t>::iterator i = x; i <= y; i = i + 9 - 8)  // Net step is +1.
+    baz (i);
+}
+
+template <typename T>
+void
+f4 (const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Downward-counting taskloop with a lastprivate iterator; T is not used in the body.
+  std::basic_string<wchar_t>::iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate(i)
+  for (i = x + 2000 - 64; i > y + 10; --i)  // From x + 1936 down to, but excluding, y + 10.
+    baz (i);
+}
+
+void
+f5 (const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Downward-counting taskloop using the compound `-=` step form.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::basic_string<wchar_t>::iterator i = x + 2000 - 64;
+       i > y + 10; i -= 10)  // Every tenth element from x + 1936 down while > y + 10.
+    baz (i);
+}
+
+template <int N>
+void
+f6 (const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Downward-counting taskloop whose body works on a copy of the iterator offset by N.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (std::basic_string<wchar_t>::iterator i = x + 2000 - 64;
+       i > y + 10; i = i - 12 + 2)  // Net step is -10.
+    {
+      std::basic_string<wchar_t>::iterator j = i + N;  // The touched element is i + N, not i.
+      baz (j);
+    }
+}
+
+template <int N>
+void
+f7 (std::basic_string<wchar_t>::iterator i,
+    const std::basic_string<wchar_t>::iterator &x,
+    const std::basic_string<wchar_t>::iterator &y)
+{  // Taskloop whose iteration variable is the by-value parameter i.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x - 10; i <= y + 10; i += N)  // Step comes from the template argument.
+    baz (i);
+}
+
+template <wchar_t N>
+void
+f8 (J<wchar_t> j)
+{  // Taskloop whose bounds come from the J accessors; N is a wchar_t non-type template argument.
+  std::basic_string<wchar_t>::iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = j.begin (); i <= j.end () + N; i += 2)  // Upper bound is the stored end iterator plus N.
+    baz (i);
+}
+
+template <typename T, int N>
+void
+f9 (const typename std::basic_string<T>::iterator &x,
+    const typename std::basic_string<T>::iterator &y)
+{  // Taskloop whose iterator type depends on the template parameter T.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (typename std::basic_string<T>::iterator i = x; i <= y; i = i + N)  // Step N via `i = i + N`.
+    baz (i);
+}
+
+template <typename T, int N>
+void
+f10 (const typename std::basic_string<T>::iterator &x,
+     const typename std::basic_string<T>::iterator &y)
+{  // Dependent-type taskloop with a `>` condition; main () instantiates it with N == -7.
+  typename std::basic_string<T>::iterator i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+  for (i = x; i > y; i = i + N)  // Counts down when N is negative.
+    baz (i);
+}
+
+template <typename T>
+void
+f11 (const T &x, const T &y)
+{  // nogroup taskloop inside `single nowait`, followed by a second `single nowait` region.
+#pragma omp parallel
+  {
+#pragma omp single nowait
+#pragma omp taskloop nogroup
+    for (T i = x; i <= y; i += 3)  // x, x + 3, ... while <= y.
+      baz (i);
+#pragma omp single nowait
+    {
+      T j = y + 3;  // One extra element, three past y, is touched outside the loop.
+      baz (j);
+    }
+  }
+}
+
+template <typename T>
+void
+f12 (const T &x, const T &y)
+{  // Downward-counting taskloop over a private iterator of template type T.
+  T i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop private(i)
+  for (i = x; i > y; --i)  // From x down to, but excluding, y.
+    baz (i);
+}
+
+template <int N>
+struct K  // Wrapper so the taskloop sits in a member template of a class template.
+{
+  template <typename T>
+  static void
+  f13 (const T &x, const T &y)
+  {
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop
+    for (T i = x; i <= y + N; i += N)  // N is both the step and the slack added to the bound.
+      baz (i);
+  }
+};
+
+#define check(expr) \
+  for (int i = 0; i < 2000; i++)			\
+    if (expr)						\
+      {							\
+	if (a[i] != L'a' + i + 1)			\
+	  std::abort ();				\
+	a[i] = L'a' + i;				\
+      }							\
+    else if (a[i] != L'a' + i)				\
+      std::abort ()  // check: indices matching expr must have been bumped exactly once (and are reset); all others must be untouched.
+
+int
+main ()
+{  // Runs each loop variant over a 2000-element wide string and verifies which elements were touched.
+  std::basic_string<wchar_t> a = L"";
+  for (int i = 0; i < 2000; i++)
+    a += L'a' + i;  // a[i] starts as L'a' + i, the baseline the check macro compares against.
+  f1 (a.begin () + 10, a.begin () + 1990);
+  check (i >= 10 && i <= 1990 && (i - 10) % 6 == 0);
+  f2 (a.begin () + 0, a.begin () + 1999);
+  check (i < 1998 && (i & 1) == 0);
+  f3<char> (a.begin () + 20, a.begin () + 1837);
+  check (i >= 20 && i <= 1837);
+  f4<int> (a.begin () + 0, a.begin () + 30);
+  check (i > 40 && i <= 2000 - 64);
+  f5 (a.begin () + 0, a.begin () + 100);
+  check (i >= 116 && i <= 2000 - 64 && (i - 116) % 10 == 0);
+  f6<-10> (a.begin () + 10, a.begin () + 110);  // Body offsets by N == -10, so the touched set equals f5's.
+  check (i >= 116 && i <= 2000 - 64 && (i - 116) % 10 == 0);
+  f7<6> (std::basic_string<wchar_t>::iterator (), a.begin () + 12,
+	 a.begin () + 1800);
+  check (i >= 2 && i <= 1808 && (i - 2) % 6 == 0);
+  f8<121> (J<wchar_t> (a.begin () + 14, a.begin () + 1803));
+  check (i >= 14 && i <= 1924 && (i & 1) == 0);
+  f9<wchar_t, 7> (a.begin () + 33, a.begin () + 1967);
+  check (i >= 33 && i <= 1967 && (i - 33) % 7 == 0);
+  f10<wchar_t, -7> (a.begin () + 1939, a.begin () + 17);
+  check (i >= 21 && i <= 1939 && (i - 21) % 7 == 0);
+  f11<std::basic_string<wchar_t>::iterator > (a.begin () + 16,
+					      a.begin () + 1981);
+  check (i >= 16 && i <= 1984 && (i - 16) % 3 == 0);  // 1984 is the extra y + 3 element f11 touches.
+  f12<std::basic_string<wchar_t>::iterator > (a.begin () + 1761,
+					      a.begin () + 37);
+  check (i > 37 && i <= 1761);
+  K<5>::f13<std::basic_string<wchar_t>::iterator > (a.begin () + 1,
+						    a.begin () + 1935);
+  check (i >= 1 && i <= 1936 && (i - 1) % 5 == 0);
+}
diff --git a/libgomp/testsuite/libgomp.c++/taskloop-9.C b/libgomp/testsuite/libgomp.c++/taskloop-9.C
new file mode 100644
index 00000000000..65abc31ff8d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c++/taskloop-9.C
@@ -0,0 +1,323 @@
+// { dg-do run }
+
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+extern "C" void abort ();
+
+template <typename T>
+class I  // Iterator-like wrapper around a raw T pointer; every operator forwards to the pointer.
+{
+public:
+  typedef ptrdiff_t difference_type;
+  I ();
+  ~I ();
+  I (T *);
+  I (const I &);
+  T &operator * ();
+  T *operator -> ();
+  T &operator [] (const difference_type &) const;
+  I &operator = (const I &);
+  I &operator ++ ();
+  I operator ++ (int);
+  I &operator -- ();
+  I operator -- (int);
+  I &operator += (const difference_type &);
+  I &operator -= (const difference_type &);
+  I operator + (const difference_type &) const;
+  I operator - (const difference_type &) const;
+  template <typename S> friend bool operator == (I<S> &, I<S> &);  // Each comparison exists in a non-const and a const overload.
+  template <typename S> friend bool operator == (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator < (I<S> &, I<S> &);
+  template <typename S> friend bool operator < (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator <= (I<S> &, I<S> &);
+  template <typename S> friend bool operator <= (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator > (I<S> &, I<S> &);
+  template <typename S> friend bool operator > (const I<S> &, const I<S> &);
+  template <typename S> friend bool operator >= (I<S> &, I<S> &);
+  template <typename S> friend bool operator >= (const I<S> &, const I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (I<S> &, I<S> &);
+  template <typename S> friend typename I<S>::difference_type operator - (const I<S> &, const I<S> &);
+  template <typename S> friend I<S> operator + (typename I<S>::difference_type , const I<S> &);
+private:
+  T *p;  // The wrapped pointer; null after default construction.
+};
+template <typename T> I<T>::I () : p (0) {}
+template <typename T> I<T>::~I () { p = (T *) 0; }  // Clears the pointer on destruction.
+template <typename T> I<T>::I (T *x) : p (x) {}
+template <typename T> I<T>::I (const I &x) : p (x.p) {}
+template <typename T> T &I<T>::operator * () { return *p; }
+template <typename T> T *I<T>::operator -> () { return p; }
+template <typename T> T &I<T>::operator [] (const difference_type &x) const { return p[x]; }
+template <typename T> I<T> &I<T>::operator = (const I &x) { p = x.p; return *this; }
+template <typename T> I<T> &I<T>::operator ++ () { ++p; return *this; }
+template <typename T> I<T> I<T>::operator ++ (int) { return I (p++); }  // Returns a copy holding the old pointer.
+template <typename T> I<T> &I<T>::operator -- () { --p; return *this; }
+template <typename T> I<T> I<T>::operator -- (int) { return I (p--); }  // Returns a copy holding the old pointer.
+template <typename T> I<T> &I<T>::operator += (const difference_type &x) { p += x; return *this; }
+template <typename T> I<T> &I<T>::operator -= (const difference_type &x) { p -= x; return *this; }
+template <typename T> I<T> I<T>::operator + (const difference_type &x) const { return I (p + x); }
+template <typename T> I<T> I<T>::operator - (const difference_type &x) const { return I (p - x); }
+template <typename T> bool operator == (I<T> &x, I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator == (const I<T> &x, const I<T> &y) { return x.p == y.p; }
+template <typename T> bool operator != (I<T> &x, I<T> &y) { return !(x == y); }  // Not a friend: implemented through operator ==.
+template <typename T> bool operator != (const I<T> &x, const I<T> &y) { return !(x == y); }
+template <typename T> bool operator < (I<T> &x, I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator < (const I<T> &x, const I<T> &y) { return x.p < y.p; }
+template <typename T> bool operator <= (I<T> &x, I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator <= (const I<T> &x, const I<T> &y) { return x.p <= y.p; }
+template <typename T> bool operator > (I<T> &x, I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator > (const I<T> &x, const I<T> &y) { return x.p > y.p; }
+template <typename T> bool operator >= (I<T> &x, I<T> &y) { return x.p >= y.p; }
+template <typename T> bool operator >= (const I<T> &x, const I<T> &y) { return x.p >= y.p; }
+template <typename T> typename I<T>::difference_type operator - (I<T> &x, I<T> &y) { return x.p - y.p; }
+template <typename T> typename I<T>::difference_type operator - (const I<T> &x, const I<T> &y) { return x.p - y.p; }
+template <typename T> I<T> operator + (typename I<T>::difference_type x, const I<T> &y) { return I<T> (x + y.p); }
+
+template <typename T>
+class J  // Stores two I<T> iterators and hands them back by const reference.
+{
+public:
+  J(const I<T> &x, const I<T> &y) : b (x), e (y) {}
+  const I<T> &begin ();  // Reference to the first stored iterator.
+  const I<T> &end ();  // Reference to the second stored iterator.
+private:
+  I<T> b, e;  // Copies of the constructor arguments x and y.
+};
+
+template <typename T> const I<T> &J<T>::begin () { return b; }
+template <typename T> const I<T> &J<T>::end () { return e; }
+
+int results[2000];
+
+template <typename T>
+void
+baz (I<T> &i)  // Loop body shared by all tests: count one visit of the value I refers to.
+{
+  if (*i < 0 || *i >= 2000)  // The value is used as an index into results[2000].
+    abort ();
+  results[*i]++;  // The check macro later expects exactly one hit per visited value.
+}
+
+I<int>
+f1 (const I<int> &x, const I<int> &y)
+{  // lastprivate taskloop over a shared I<int>, followed by a second single that advances it.
+  I<int> i;
+#pragma omp parallel shared (i)
+  {
+  #pragma omp single
+  #pragma omp taskloop lastprivate (i)
+    for (i = x; i < y - 1; ++i)
+      baz (i);
+  #pragma omp single
+    i += 3;  // Applied to the value lastprivate wrote back after the loop.
+  }
+  return I<int> (i);
+}
+
+I<int>
+f2 (const I<int> &x, const I<int> &y)
+{  // lastprivate taskloop; returns the iterator value left after the loop.
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x; i < y - 1; i = 1 - 6 + 7 + i)  // Increment spelled as a constant expression; net step is +2.
+    baz (i);
+  return I<int> (i);
+}
+
+template <typename T>
+I<int>
+f3 (const I<int> &x, const I<int> &y)
+{  // Upward lastprivate taskloop using post-increment; T is not used in the body.
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+    for (i = x + 1000 - 64; i <= y - 10; i++)  // From x + 936 up to and including y - 10.
+      baz (i);
+  return i;
+}
+
+template <typename T>
+I<int>
+f4 (const I<int> &x, const I<int> &y)
+{  // Downward lastprivate taskloop using pre-decrement; T is not used in the body.
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x + 2000 - 64; i > y + 10; --i)  // From x + 1936 down to, but excluding, y + 10.
+    baz (i);
+  return I<int> (i);
+}
+
+template <typename T>
+I<int>
+f5 (const I<int> &x, const I<int> &y)
+{  // Downward lastprivate taskloop whose bound offset is converted through the template type T.
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x; i > y + T (6); i--)  // Post-decrement step form.
+    baz (i);
+  return i;
+}
+
+template <typename T>
+I<int>
+f6 (const I<int> &x, const I<int> &y)
+{  // Downward lastprivate taskloop whose start offset and step are converted through T.
+  I<int> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x - T (7); i > y; i -= T (2))  // Starts 7 below x and steps down by 2.
+    baz (i);
+  return I<int> (i);
+}
+
+template <int N>
+I<int>
+f7 (I<int> i, const I<int> &x, const I<int> &y)
+{  // lastprivate taskloop whose iteration variable is the by-value parameter i.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x - 10; i <= y + 10; i += N)  // Step comes from the template argument.
+    baz (i);
+  return I<int> (i);
+}
+
+template <int N>
+I<int>
+f8 (J<int> j)
+{  // lastprivate taskloop whose bounds come from the J accessors.
+  I<int> i;
+#pragma omp parallel shared (i)
+  #pragma omp single
+  #pragma omp taskloop lastprivate (i)
+    for (i = j.begin (); i <= j.end () + N; i += 2)  // Upper bound is the stored end iterator plus N.
+      baz (i);
+  return i;
+}
+
+I<int> i9;
+
+template <long N>
+I<int> &
+f9 (J<int> j)
+{  // lastprivate taskloop over the namespace-scope iterator i9; main () instantiates it with N == -3.
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i9)
+  for (i9 = j.begin () + N; i9 <= j.end () - N; i9 = i9 - N)  // With negative N the loop counts upward.
+    baz (i9);
+  return i9;
+}
+
+template <typename T, int N>
+I<T>
+f10 (const I<T> &x, const I<T> &y)
+{  // lastprivate taskloop over a dependent I<T>; main () instantiates it with N == -7.
+  I<T> i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x; i > y; i = i + N)  // Counts down when N is negative.
+    baz (i);
+  return i;
+}
+
+template <typename T, typename U>
+T
+f11 (T i, const T &x, const T &y)
+{  // lastprivate taskloop whose iterator type is T and whose offsets are converted through U.
+#pragma omp parallel
+  #pragma omp single
+  #pragma omp taskloop lastprivate (i)
+  for (i = x + U (2); i <= y + U (1); i = U (2) + U (3) + i)  // Net step is +5, written as `expr + i`.
+    baz (i);
+  return T (i);
+}
+
+template <typename T>
+T
+f12 (const T &x, const T &y)
+{  // Downward lastprivate taskloop over an iterator of template type T.
+  T i;
+#pragma omp parallel
+#pragma omp single
+#pragma omp taskloop lastprivate (i)
+  for (i = x; i > y; --i)  // From x down to, but excluding, y.
+    baz (i);
+  return i;
+}
+
+#define check(expr) \
+  for (int i = 0; i < 2000; i++)			\
+    if (expr)						\
+      {							\
+	if (results[i] != 1)				\
+	  abort ();					\
+	results[i] = 0;					\
+      }							\
+    else if (results[i])				\
+      abort ()  // check: values matching expr must have been visited exactly once (count is reset); all others never.
+
+int
+main ()
+{  // For each variant: compare the value behind the returned iterator, then verify the visited set.
+  int a[2000];
+  long b[2000];
+  for (int i = 0; i < 2000; i++)
+    {
+      a[i] = i;  // Each element holds its own index, so *iterator identifies the position.
+      b[i] = i;
+    }
+  if (*f1 (&a[10], &a[1873]) != 1875)  // Loop leaves i at 1872; f1 then adds 3.
+    abort ();
+  check (i >= 10 && i < 1872);
+  if (*f2 (&a[0], &a[1998]) != 1998)
+    abort ();
+  check (i < 1997 && (i & 1) == 0);
+  if (*f3<int> (&a[10], &a[1971]) != 1962)
+    abort ();
+  check (i >= 946 && i <= 1961);
+  if (*f4<int> (&a[0], &a[30]) != 40)
+    abort ();
+  check (i > 40 && i <= 2000 - 64);
+  if (*f5<short> (&a[1931], &a[17]) != 23)
+    abort ();
+  check (i > 23 && i <= 1931);
+  if (*f6<long> (&a[1931], &a[17]) != 16)
+    abort ();
+  check (i > 17 && i <= 1924 && (i & 1) == 0);
+  if (*f7<6> (I<int> (), &a[12], &a[1800]) != 1814)
+    abort ();
+  check (i >= 2 && i <= 1808 && (i - 2) % 6 == 0);
+  if (*f8<121> (J<int> (&a[14], &a[1803])) != 1926)
+    abort ();
+  check (i >= 14 && i <= 1924 && (i & 1) == 0);
+  if (*f9<-3L> (J<int> (&a[27], &a[1761])) != 1767)
+    abort ();
+  check (i >= 24 && i <= 1764 && (i % 3) == 0);
+  if (*f10<int, -7> (&a[1939], &a[17]) != 14)
+    abort ();
+  check (i >= 21 && i <= 1939 && i % 7 == 0);
+  if (*f11<I<int>, short> (I<int> (), &a[71], &a[1941]) != 1943)
+    abort ();
+  check (i >= 73 && i <= 1938 && (i - 73) % 5 == 0);
+  if (*f12<I<int> > (&a[1761], &a[37]) != 37)
+    abort ();
+  check (i > 37 && i <= 1761);
+  if (*f10<long, -7> (&b[1939], &b[17]) != 14)  // Same templates again with T == long over b.
+    abort ();
+  check (i >= 21 && i <= 1939 && i % 7 == 0);
+  if (*f11<I<long>, short> (I<long> (), &b[71], &b[1941]) != 1943)
+    abort ();
+  check (i >= 73 && i <= 1938 && (i - 73) % 5 == 0);
+  if (*f12<I<long> > (&b[1761], &b[37]) != 37)
+    abort ();
+  check (i > 37 && i <= 1761);
+}
diff --git a/libgomp/testsuite/libgomp.c/affinity-2.c b/libgomp/testsuite/libgomp.c/affinity-2.c
new file mode 100644
index 00000000000..f8216574704
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/affinity-2.c
@@ -0,0 +1,89 @@
+/* { dg-do run } */
+/* { dg-set-target-env-var OMP_PROC_BIND "spread,close" } */
+/* { dg-set-target-env-var OMP_PLACES "{6,7}:4:-2,!{2,3}" } */
+/* { dg-set-target-env-var OMP_NUM_THREADS "2" } */
+
+#include <omp.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int *
+get_buf (int nump)
+{
+  /* Cached scratch buffer and its capacity in ints; both persist across calls.  */
+  static int *buf;
+  static size_t buf_size;
+  if ((size_t) nump <= buf_size)
+    return buf;	/* Current capacity already suffices.  */
+  /* Double the capacity; if that is still too small use NUMP plus 64 spare slots.  */
+  size_t grown = buf_size * 2;
+  buf_size = nump > grown ? nump + 64 : grown;
+  int *resized = realloc (buf, buf_size * sizeof (int));
+  if (resized == NULL)
+    {
+      fprintf (stderr, "memory allocation error\n");
+      exit (1);
+    }
+  buf = resized;
+  return buf;
+}
+
+void
+print_place (int count, int *ids)
+{
+  /* Print IDS as "{...}", folding each run of consecutive ids into "first:length".  */
+  int start = 0;
+  printf ("{");
+  while (start < count)
+    {
+      /* Extend the run while each id is exactly one more than its predecessor.  */
+      int len = 1;
+      while (start + len < count && ids[start + len] == ids[start] + len)
+	len++;
+      if (start != 0)
+	printf (",");
+      if (len == 1)
+	printf ("%d", ids[start]);
+      else
+	printf ("%d:%d", ids[start], len);
+      start += len;
+    }
+  printf ("}\n");
+}
+
+void
+print_place_var (void)
+{
+  /* Report the caller's place number and the two ends of its place partition.  */
+  const int part_len = omp_get_partition_num_places ();
+  int *part = get_buf (part_len);
+  omp_get_partition_place_nums (part);
+  printf ("place %d\n", omp_get_place_num ());
+  if (part_len != 0)
+    printf ("partition %d-%d\n", part[0], part[part_len - 1]);
+}
+
+int
+main ()
+{	/* Prints every place's processor ids, then the place partition outside and inside nested parallel regions.  */
+  int i, num = omp_get_num_places (), nump, *ids;
+  printf ("omp_get_num_places () == %d\n", num);
+  for (i = 0; i < num; i++)
+    {
+      printf ("place %d ", i);
+      nump = omp_get_place_num_procs (i);
+      ids = get_buf (nump);	/* Scratch space large enough for NUMP ids.  */
+      omp_get_place_proc_ids (i, ids);
+      print_place (nump, ids);
+    }
+  print_place_var ();	/* Called outside any parallel region.  */
+  omp_set_nested (1);
+  #pragma omp parallel
+    if (omp_get_thread_num () == omp_get_num_threads () - 1)	/* Only the highest-numbered thread continues.  */
+      {
+      #pragma omp parallel
+	if (omp_get_thread_num () == omp_get_num_threads () - 1)
+	  print_place_var ();	/* Highest-numbered thread of the nested team.  */
+      }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/doacross-1.c b/libgomp/testsuite/libgomp.c/doacross-1.c
new file mode 100644
index 00000000000..0794c80ec2e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/doacross-1.c
@@ -0,0 +1,181 @@
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8];
+volatile int d, e;
+
+int
+main ()
+{	/* Doacross tests: each ordered(n) loop checks that the iterations named in depend(sink) finished first.  */
+  int i, j, k, l, m;
+  #pragma omp parallel private (l)
+  {
+    #pragma omp for schedule(static, 1) ordered (1) nowait
+    for (i = 0; i < N; i++)
+      {
+	#pragma omp atomic write
+	a[i] = 1;	/* States: 1 = started, 2 = past the sink wait, 3 = past the source post.  */
+	#pragma omp ordered depend(sink: i - 1)
+	if (i)
+	  {
+	    #pragma omp atomic read
+	    l = a[i - 1];
+	    if (l < 2)	/* The previous iteration must have reached its source post.  */
+	      abort ();
+	  }
+	#pragma omp atomic write
+	a[i] = 2;
+	if (i < N - 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i + 1];
+	    if (l == 3)	/* The next iteration cannot finish before this one posts.  */
+	      abort ();
+	  }
+	#pragma omp ordered depend(source)
+	#pragma omp atomic write
+	a[i] = 3;
+      }
+    #pragma omp for schedule(static, 0) ordered (3) nowait
+    for (i = 2; i < N / 16 - 1; i++)
+      for (j = 0; j < 8; j += 2)
+	for (k = 1; k <= 3; k++)
+	  {
+	    #pragma omp atomic write
+	    b[i][j][k] = 1;
+	    #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+				depend(sink: i - 2, j - 2, k + 1)
+	    #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+	    if (j >= 2 && k > 1)	/* Each guard below holds only when the sink iteration exists.  */
+	      {
+		#pragma omp atomic read
+		l = b[i][j - 2][k - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    b[i][j][k] = 2;
+	    if (i >= 4 && j >= 2 && k < 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 2][j - 2][k + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (i >= 5 && j + 2 < 8 && k == 3)
+	      {	/* j + 2 must stay inside b's second dimension (8); at j == 6 the sink is outside the iteration space.  */
+		#pragma omp atomic read
+		l = b[i - 3][j + 2][k - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    b[i][j][k] = 3;
+	  }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+    D(m)
+#undef A
+    #pragma omp for collapse (2) ordered(61) schedule(dynamic, 15)
+    for (i = 0; i < N / 32; i++)	/* i, j, k plus the macro-generated single-trip m loops form the nest.  */
+      for (j = 7; j > 1; j--)
+	for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+	  D(m)
+#undef A
+	    {
+	      #pragma omp atomic write
+	      c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+	      #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+				  depend (sink:i - 2, j + 1, k - 4 E(m)) \
+				  depend(sink: i - 1, j - 2, k - 2 E(m))
+	      if (k <= 4)
+		{
+		  l = c[i][j][k + 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp atomic write
+	      c[i][j][k] = 2;
+	      if (i >= 2 && j < 7 && k >= 4)
+		{
+		  l = c[i - 2][j + 1][k - 4];
+		  if (l < 2)
+		    abort ();
+		}
+	      if (i >= 1 && j >= 4 && k >= 2)
+		{
+		  l = c[i - 1][j - 2][k - 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp ordered depend (source)
+	      #pragma omp atomic write
+	      c[i][j][k] = 3;
+	    }
+
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k)
+    for (i = 0; i < d + 1; i++)	/* d is never assigned, so it is 0 and the k loop below has no iterations.  */
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d; k++)
+	  for (l = 0; l < d + 2; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp single
+    {
+      if (i != 1 || j != -1 || k != 0)	/* lastprivate values a sequential run of the nest would leave.  */
+	abort ();
+      i = 8; j = 9; k = 10;
+    }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m)
+    for (i = 0; i < d + 1; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (m = 0; m < d; m++)	/* Innermost loop has no iterations, so the body must never run.  */
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+	      abort ();
+	    }
+    #pragma omp single
+    if (i != 1 || j != -1 || k != 2 || m != 0)
+      abort ();
+    #pragma omp for collapse(2) ordered(4) nowait
+    for (i = 0; i < d + 1; i++)
+      for (j = d; j > 0; j--)	/* Collapsed j loop has no iterations.  */
+	for (k = 0; k < d + 2; k++)
+	  for (l = 0; l < d + 4; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp for nowait
+    for (i = 0; i < N; i++)	/* Final pass: every visited element must be 3, every other element 0.  */
+      if (a[i] != 3)
+	abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 4; k++)
+	  if (b[i][j][k] != 3 * (i >= 2 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+	    abort ();
+    #pragma omp for collapse(3) nowait
+    for (i = 0; i < N / 32; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 8; k++)
+	  if (c[i][j][k] != 3 * (j >= 2 && (k & 1) == 0))
+	    abort ();
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/doacross-2.c b/libgomp/testsuite/libgomp.c/doacross-2.c
new file mode 100644
index 00000000000..e491bb22965
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/doacross-2.c
@@ -0,0 +1,225 @@
+extern void abort (void);
+
+#define N 256
+int a[N], b[N / 16][8][4], c[N / 32][8][8], g[N / 16][8][6];
+volatile int d, e;
+volatile unsigned long long f;
+
+int
+main ()
+{	/* Doacross tests with an unsigned long long iteration variable; bounds add the volatile f.  */
+  unsigned long long i;
+  int j, k, l, m;
+  #pragma omp parallel private (l)
+  {
+    #pragma omp for schedule(static, 1) ordered (1) nowait
+    for (i = 1; i < N + f; i++)
+      {
+	#pragma omp atomic write
+	a[i] = 1;	/* States: 1 = started, 2 = past the sink wait, 3 = past the source post.  */
+	#pragma omp ordered depend(sink: i - 1)
+	if (i > 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i - 1];
+	    if (l < 2)	/* The previous iteration must have reached its source post.  */
+	      abort ();
+	  }
+	#pragma omp atomic write
+	a[i] = 2;
+	if (i < N - 1)
+	  {
+	    #pragma omp atomic read
+	    l = a[i + 1];
+	    if (l == 3)	/* The next iteration cannot finish before this one posts.  */
+	      abort ();
+	  }
+	#pragma omp ordered depend(source)
+	#pragma omp atomic write
+	a[i] = 3;
+      }
+    #pragma omp for schedule(static, 0) ordered (3) nowait
+    for (i = 3; i < N / 16 - 1 + f; i++)
+      for (j = 0; j < 8; j += 2)
+	for (k = 1; k <= 3; k++)
+	  {
+	    #pragma omp atomic write
+	    b[i][j][k] = 1;
+	    #pragma omp ordered depend(sink: i, j - 2, k - 1) \
+				depend(sink: i - 2, j - 2, k + 1)
+	    #pragma omp ordered depend(sink: i - 3, j + 2, k - 2)
+	    if (j >= 2 && k > 1)	/* Each guard below holds only when the sink iteration exists.  */
+	      {
+		#pragma omp atomic read
+		l = b[i][j - 2][k - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    b[i][j][k] = 2;
+	    if (i >= 5 && j >= 2 && k < 3)
+	      {
+		#pragma omp atomic read
+		l = b[i - 2][j - 2][k + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (i >= 6 && j + 2 < 8 && k == 3)
+	      {	/* j + 2 must stay inside b's second dimension (8); at j == 6 the sink is outside the iteration space.  */
+		#pragma omp atomic read
+		l = b[i - 3][j + 2][k - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    b[i][j][k] = 3;
+	  }
+#define A(n) int n;
+#define B(n) A(n##0) A(n##1) A(n##2) A(n##3)
+#define C(n) B(n##0) B(n##1) B(n##2) B(n##3)
+#define D(n) C(n##0) C(n##1) C(n##2) C(n##3)
+    D(m)
+#undef A
+    #pragma omp for collapse (2) ordered(61) schedule(dynamic, 15)
+    for (i = 2; i < N / 32 + f; i++)	/* i, j, k plus the macro-generated single-trip m loops form the nest.  */
+      for (j = 7; j > 1; j--)
+	for (k = 6; k >= 0; k -= 2)
+#define A(n) for (n = 4; n < 5; n++)
+	  D(m)
+#undef A
+	    {
+	      #pragma omp atomic write
+	      c[i][j][k] = 1;
+#define A(n) ,n
+#define E(n) C(n##0) C(n##1) C(n##2) B(n##30) B(n##31) A(n##320) A(n##321)
+	      #pragma omp ordered depend (sink: i, j, k + 2 E(m)) \
+				  depend (sink:i - 2, j + 1, k - 4 E(m)) \
+				  depend(sink: i - 1, j - 2, k - 2 E(m))
+	      if (k <= 4)
+		{
+		  l = c[i][j][k + 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp atomic write
+	      c[i][j][k] = 2;
+	      if (i >= 4 && j < 7 && k >= 4)
+		{
+		  l = c[i - 2][j + 1][k - 4];
+		  if (l < 2)
+		    abort ();
+		}
+	      if (i >= 3 && j >= 4 && k >= 2)
+		{
+		  l = c[i - 1][j - 2][k - 2];
+		  if (l < 2)
+		    abort ();
+		}
+	      #pragma omp ordered depend (source)
+	      #pragma omp atomic write
+	      c[i][j][k] = 3;
+	    }
+    #pragma omp for schedule(static, 0) ordered (3) nowait
+    for (j = 0; j < N / 16 - 1; j++)	/* Same pattern as the b loop, with the unsigned long long variable innermost.  */
+      for (k = 0; k < 8; k += 2)
+	for (i = 3; i <= 5 + f; i++)
+	  {
+	    #pragma omp atomic write
+	    g[j][k][i] = 1;
+	    #pragma omp ordered depend(sink: j, k - 2, i - 1) \
+				depend(sink: j - 2, k - 2, i + 1)
+	    #pragma omp ordered depend(sink: j - 3, k + 2, i - 2)
+	    if (k >= 2 && i > 3)
+	      {
+		#pragma omp atomic read
+		l = g[j][k - 2][i - 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp atomic write
+	    g[j][k][i] = 2;
+	    if (j >= 2 && k >= 2 && i < 5)
+	      {
+		#pragma omp atomic read
+		l = g[j - 2][k - 2][i + 1];
+		if (l < 2)
+		  abort ();
+	      }
+	    if (j >= 3 && k + 2 < 8 && i == 5)
+	      {	/* k + 2 must stay inside g's second dimension (8); at k == 6 the sink is outside the iteration space.  */
+		#pragma omp atomic read
+		l = g[j - 3][k + 2][i - 2];
+		if (l < 2)
+		  abort ();
+	      }
+	    #pragma omp ordered depend(source)
+	    #pragma omp atomic write
+	    g[j][k][i] = 3;
+	  }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k)
+    for (i = 2; i < f + 3; i++)	/* d and f are never assigned, so they are 0 and the k loop below has no iterations.  */
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d; k++)
+	  for (l = 0; l < d + 2; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp single
+    {
+      if (i != 3 || j != -1 || k != 0)	/* lastprivate values a sequential run of the nest would leave.  */
+	abort ();
+      i = 8; j = 9; k = 10;
+    }
+    #pragma omp for collapse(2) ordered(4) lastprivate (i, j, k, m)
+    for (i = 2; i < f + 3; i++)
+      for (j = d + 1; j >= 0; j--)
+	for (k = 0; k < d + 2; k++)
+	  for (m = 0; m < d; m++)	/* Innermost loop has no iterations, so the body must never run.  */
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, m)
+	      abort ();
+	    }
+    #pragma omp single
+    if (i != 3 || j != -1 || k != 2 || m != 0)
+      abort ();
+    #pragma omp for collapse(2) ordered(4) nowait
+    for (i = 2; i < f + 3; i++)
+      for (j = d; j > 0; j--)	/* Collapsed j loop has no iterations.  */
+	for (k = 0; k < d + 2; k++)
+	  for (l = 0; l < d + 4; l++)
+	    {
+	      #pragma omp ordered depend (source)
+	      #pragma omp ordered depend (sink:i - 2, j + 2, k - 2, l)
+	      if (!e)
+		abort ();
+	    }
+    #pragma omp for nowait
+    for (i = 0; i < N; i++)	/* Final pass: every visited element must be 3, every other element 0.  */
+      if (a[i] != 3 * (i >= 1))
+	abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 4; k++)
+	  if (b[i][j][k] != 3 * (i >= 3 && i < N / 16 - 1 && (j & 1) == 0 && k >= 1))
+	    abort ();
+    #pragma omp for collapse(3) nowait
+    for (i = 0; i < N / 32; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 8; k++)
+	  if (c[i][j][k] != 3 * (i >= 2 && j >= 2 && (k & 1) == 0))
+	    abort ();
+    #pragma omp for collapse(2) private(k) nowait
+    for (i = 0; i < N / 16; i++)
+      for (j = 0; j < 8; j++)
+	for (k = 0; k < 6; k++)
+	  if (g[i][j][k] != 3 * (i < N / 16 - 1 && (j & 1) == 0 && k >= 3))
+	    abort ();
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/examples-4/declare_target-1.c b/libgomp/testsuite/libgomp.c/examples-4/declare_target-1.c
index beca8555780..6d4bc4fac12 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/declare_target-1.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/declare_target-1.c
@@ -20,7 +20,7 @@ int fib_wrapper (int n)
 {
   int x = 0;
 
-  #pragma omp target if(n > THRESHOLD)
+  #pragma omp target if(n > THRESHOLD) map(from:x)
     x = fib (n);
 
   return x;
diff --git a/libgomp/testsuite/libgomp.c/examples-4/declare_target-4.c b/libgomp/testsuite/libgomp.c/examples-4/declare_target-4.c
index db70460b309..f2414366951 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/declare_target-4.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/declare_target-4.c
@@ -41,7 +41,7 @@ float accum (int k)
   int i;
   float tmp = 0.0;
 
-  #pragma omp target
+  #pragma omp target map(tofrom:tmp)
     #pragma omp parallel for reduction(+:tmp)
       for (i = 0; i < N; i++)
 	tmp += Pfun (i, k);
diff --git a/libgomp/testsuite/libgomp.c/examples-4/declare_target-5.c b/libgomp/testsuite/libgomp.c/examples-4/declare_target-5.c
index b550f1ff540..33d6137afd5 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/declare_target-5.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/declare_target-5.c
@@ -48,7 +48,7 @@ float accum ()
   int i, k;
   float tmp = 0.0;
 
-  #pragma omp target
+  #pragma omp target map(tofrom:tmp)
     #pragma omp parallel for reduction(+:tmp)
       for (i = 0; i < N; i++)
 	{
diff --git a/libgomp/testsuite/libgomp.c/examples-4/device-1.c b/libgomp/testsuite/libgomp.c/examples-4/device-1.c
index f7c84fb4c14..dad8572f8f0 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/device-1.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/device-1.c
@@ -10,11 +10,11 @@ int main ()
   int b = 0;
   int c, d;
 
-  #pragma omp target if(a > 200 && a < 400)
+  #pragma omp target if(a > 200 && a < 400) map(from: c)
     c = omp_is_initial_device ();
 
   #pragma omp target data map(to: b) if(a > 200 && a < 400)
-    #pragma omp target
+    #pragma omp target map(from: b, d)
       {
 	b = 100;
 	d = omp_is_initial_device ();
@@ -26,11 +26,11 @@ int main ()
   a += 200;
   b = 0;
 
-  #pragma omp target if(a > 200 && a < 400)
+  #pragma omp target if(a > 200 && a < 400) map(from: c)
     c = omp_is_initial_device ();
 
   #pragma omp target data map(to: b) if(a > 200 && a < 400)
-    #pragma omp target
+    #pragma omp target map(from: b, d)
       {
 	b = 100;
 	d = omp_is_initial_device ();
@@ -42,11 +42,11 @@ int main ()
   a += 200;
   b = 0;
 
-  #pragma omp target if(a > 200 && a < 400)
+  #pragma omp target if(a > 200 && a < 400) map(from: c)
     c = omp_is_initial_device ();
 
   #pragma omp target data map(to: b) if(a > 200 && a < 400)
-    #pragma omp target
+    #pragma omp target map(from: b, d)
       {
 	b = 100;
 	d = omp_is_initial_device ();
diff --git a/libgomp/testsuite/libgomp.c/examples-4/device-3.c b/libgomp/testsuite/libgomp.c/examples-4/device-3.c
index 8a0cf7c200d..af086533278 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/device-3.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/device-3.c
@@ -9,7 +9,7 @@ int main ()
   int res;
   int default_device = omp_get_default_device ();
 
-  #pragma omp target
+  #pragma omp target map(from: res)
     res = omp_is_initial_device ();
 
   if (res)
@@ -17,7 +17,7 @@ int main ()
 
   omp_set_default_device (omp_get_num_devices ());
 
-  #pragma omp target
+  #pragma omp target map(from: res)
     res = omp_is_initial_device ();
 
   if (!res)
diff --git a/libgomp/testsuite/libgomp.c/examples-4/target_data-3.c b/libgomp/testsuite/libgomp.c/examples-4/target_data-3.c
index abb283801f8..46b674013d0 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/target_data-3.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/target_data-3.c
@@ -47,7 +47,7 @@ void gramSchmidt (int Q[][COLS], const int rows, const int cols)
       {
 	int tmp = 0;
 
-	#pragma omp target
+	#pragma omp target map(tofrom:tmp)
 	  #pragma omp parallel for reduction(+:tmp)
 	    for (i = 0; i < rows; i++)
 	      tmp += (Q[i][k] * Q[i][k]);
diff --git a/libgomp/testsuite/libgomp.c/examples-4/teams-2.c b/libgomp/testsuite/libgomp.c/examples-4/teams-2.c
index 8bbbc355b17..7d0a60ebb51 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/teams-2.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/teams-2.c
@@ -32,7 +32,7 @@ float dotprod (float B[], float C[], int n, int block_size,
   int i, i0;
   float sum = 0;
 
-  #pragma omp target map(to: B[0:n], C[0:n])
+  #pragma omp target map(to: B[0:n], C[0:n]) map(tofrom: sum)
     #pragma omp teams num_teams(num_teams) thread_limit(block_threads) \
 		      reduction(+:sum)
       #pragma omp distribute
diff --git a/libgomp/testsuite/libgomp.c/examples-4/teams-3.c b/libgomp/testsuite/libgomp.c/examples-4/teams-3.c
index b6708785884..5fe63a68a4b 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/teams-3.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/teams-3.c
@@ -31,7 +31,7 @@ float dotprod (float B[], float C[], int n)
   int i;
   float sum = 0;
 
-  #pragma omp target teams map(to: B[0:n], C[0:n])
+  #pragma omp target teams map(to: B[0:n], C[0:n]) map(tofrom: sum)
     #pragma omp distribute parallel for reduction(+:sum)
       for (i = 0; i < n; i++)
 	sum += B[i] * C[i];
diff --git a/libgomp/testsuite/libgomp.c/examples-4/teams-4.c b/libgomp/testsuite/libgomp.c/examples-4/teams-4.c
index 9aef78ecfba..6136eabef66 100644
--- a/libgomp/testsuite/libgomp.c/examples-4/teams-4.c
+++ b/libgomp/testsuite/libgomp.c/examples-4/teams-4.c
@@ -31,7 +31,7 @@ float dotprod (float B[], float C[], int n)
   int i;
   float sum = 0;
 
-  #pragma omp target map(to: B[0:n], C[0:n])
+  #pragma omp target map(to: B[0:n], C[0:n]) map(tofrom:sum)
     #pragma omp teams num_teams(8) thread_limit(16)
       #pragma omp distribute parallel for reduction(+:sum) \
 					  dist_schedule(static, 1024) \
diff --git a/libgomp/testsuite/libgomp.c/for-2.h b/libgomp/testsuite/libgomp.c/for-2.h
index 920d23b5202..0bd116c5aec 100644
--- a/libgomp/testsuite/libgomp.c/for-2.h
+++ b/libgomp/testsuite/libgomp.c/for-2.h
@@ -11,11 +11,21 @@ noreturn (void)
 #ifndef SC
 #define SC
 #endif
+#ifndef OMPTGT
+#define OMPTGT
+#endif
+#ifndef OMPTO
+#define OMPTO(v) do {} while (0)
+#endif
+#ifndef OMPFROM
+#define OMPFROM(v) do {} while (0)
+#endif
 
 __attribute__((noinline, noclone)) void
 N(f0) (void)
 {
   int i;
+  OMPTGT
 #pragma omp F S
   for (i = 0; i < 1500; i++)
     a[i] += 2;
@@ -24,6 +34,7 @@ N(f0) (void)
 __attribute__((noinline, noclone)) void
 N(f1) (void)
 {
+  OMPTGT
 #pragma omp F S
   for (unsigned int i = __INT_MAX__; i < 3000U + __INT_MAX__; i += 2)
     a[(i - __INT_MAX__) >> 1] -= 2;
@@ -33,6 +44,7 @@ __attribute__((noinline, noclone)) void
 N(f2) (void)
 {
   unsigned long long i;
+  OMPTGT
 #pragma omp F S
   for (i = __LONG_LONG_MAX__ + 4500ULL - 27;
        i > __LONG_LONG_MAX__ - 27ULL; i -= 3)
@@ -42,6 +54,7 @@ N(f2) (void)
 __attribute__((noinline, noclone)) void
 N(f3) (long long n1, long long n2, long long s3)
 {
+  OMPTGT
 #pragma omp F S
   for (long long i = n1 + 23; i > n2 - 25; i -= s3)
     a[i + 48] += 7;
@@ -51,6 +64,7 @@ __attribute__((noinline, noclone)) void
 N(f4) (void)
 {
   unsigned int i;
+  OMPTGT
 #pragma omp F S
   for (i = 30; i < 20; i += 2)
     a[i] += 10;
@@ -61,6 +75,7 @@ N(f5) (int n11, int n12, int n21, int n22, int n31, int n32,
        int s1, int s2, int s3)
 {
   SC int v1, v2, v3;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (v1 = n11; v1 < n12; v1 += s1)
     for (v2 = n21; v2 < n22; v2 += s2)
@@ -74,6 +89,7 @@ N(f6) (int n11, int n12, int n21, int n22, long long n31, long long n32,
 {
   SC int v1, v2;
   SC long long v3;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (v1 = n11; v1 > n12; v1 += s1)
     for (v2 = n21; v2 > n22; v2 += s2)
@@ -86,6 +102,7 @@ N(f7) (void)
 {
   SC unsigned int v1, v3;
   SC unsigned long long v2;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (v1 = 0; v1 < 20; v1 += 2)
     for (v2 = __LONG_LONG_MAX__ + 16ULL;
@@ -98,6 +115,7 @@ __attribute__((noinline, noclone)) void
 N(f8) (void)
 {
   SC long long v1, v2, v3;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (v1 = 0; v1 < 20; v1 += 2)
     for (v2 = 30; v2 < 20; v2++)
@@ -109,6 +127,7 @@ __attribute__((noinline, noclone)) void
 N(f9) (void)
 {
   int i;
+  OMPTGT
 #pragma omp F S
   for (i = 20; i < 10; i++)
     {
@@ -122,6 +141,7 @@ __attribute__((noinline, noclone)) void
 N(f10) (void)
 {
   SC int i;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (i = 0; i < 10; i++)
     for (int j = 10; j < 8; j++)
@@ -137,6 +157,7 @@ __attribute__((noinline, noclone)) void
 N(f11) (int n)
 {
   int i;
+  OMPTGT
 #pragma omp F S
   for (i = 20; i < n; i++)
     {
@@ -150,6 +171,7 @@ __attribute__((noinline, noclone)) void
 N(f12) (int n)
 {
   SC int i;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (i = 0; i < 10; i++)
     for (int j = n; j < 8; j++)
@@ -165,6 +187,7 @@ __attribute__((noinline, noclone)) void
 N(f13) (void)
 {
   int *i;
+  OMPTGT
 #pragma omp F S
   for (i = a; i < &a[1500]; i++)
     i[0] += 2;
@@ -174,6 +197,7 @@ __attribute__((noinline, noclone)) void
 N(f14) (void)
 {
   SC float *i;
+  OMPTGT
 #pragma omp F S collapse(3)
   for (i = &b[0][0][0]; i < &b[0][0][10]; i++)
     for (float *j = &b[0][15][0]; j > &b[0][0][0]; j -= 10)
@@ -188,27 +212,34 @@ N(test) (void)
   int i, j, k;
   for (i = 0; i < 1500; i++)
     a[i] = i - 25;
+  OMPTO (a);
   N(f0) ();
+  OMPFROM (a);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 23)
       return 1;
   N(f1) ();
+  OMPFROM (a);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 25)
       return 1;
   N(f2) ();
+  OMPFROM (a);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 29)
       return 1;
   N(f3) (1500LL - 1 - 23 - 48, -1LL + 25 - 48, 1LL);
+  OMPFROM (a);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 22)
       return 1;
   N(f3) (1500LL - 1 - 23 - 48, 1500LL - 1, 7LL);
+  OMPFROM (a);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 22)
       return 1;
   N(f4) ();
+  OMPFROM (a);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 22)
       return 1;
@@ -216,31 +247,37 @@ N(test) (void)
     for (j = 0; j < 15; j++)
       for (k = 0; k < 10; k++)
 	b[i][j][k] = i - 2.5 + 1.5 * j - 1.5 * k;
+  OMPTO (b);
   N(f5) (0, 10, 0, 15, 0, 10, 1, 1, 1);
+  OMPFROM (b);
   for (i = 0; i < 10; i++)
     for (j = 0; j < 15; j++)
       for (k = 0; k < 10; k++)
 	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
 	  return 1;
   N(f5) (0, 10, 30, 15, 0, 10, 4, 5, 6);
+  OMPFROM (b);
   for (i = 0; i < 10; i++)
     for (j = 0; j < 15; j++)
       for (k = 0; k < 10; k++)
 	if (b[i][j][k] != i + 1.5 * j - 1.5 * k)
 	  return 1;
   N(f6) (9, -1, 29, 0, 9, -1, -1, -2, -1);
+  OMPFROM (b);
   for (i = 0; i < 10; i++)
     for (j = 0; j < 15; j++)
       for (k = 0; k < 10; k++)
 	if (b[i][j][k] != i - 4.5 + 1.5 * j - 1.5 * k)
 	  return 1;
   N(f7) ();
+  OMPFROM (b);
   for (i = 0; i < 10; i++)
     for (j = 0; j < 15; j++)
       for (k = 0; k < 10; k++)
 	if (b[i][j][k] != i + 1.0 + 1.5 * j - 1.5 * k)
 	  return 1;
   N(f8) ();	  
+  OMPFROM (b);
   for (i = 0; i < 10; i++)
     for (j = 0; j < 15; j++)
       for (k = 0; k < 10; k++)
@@ -250,6 +287,8 @@ N(test) (void)
   N(f10) ();
   N(f11) (10);
   N(f12) (12);
+  OMPFROM (a);
+  OMPFROM (b);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 22)
       return 1;
@@ -260,6 +299,8 @@ N(test) (void)
 	  return 1;
   N(f13) ();
   N(f14) ();
+  OMPFROM (a);
+  OMPFROM (b);
   for (i = 0; i < 1500; i++)
     if (a[i] != i - 20)
       return 1;
diff --git a/libgomp/testsuite/libgomp.c/for-4.c b/libgomp/testsuite/libgomp.c/for-4.c
new file mode 100644
index 00000000000..ef5465e1e76
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-4.c
@@ -0,0 +1,42 @@
+/* { dg-options "-std=gnu99 -fopenmp" } */
+
+extern void abort (void);
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+/* N(x) pastes x_G_normal (M expands G before O pastes); F and S fill for-2.h's "#pragma omp F S".  */
+#define F taskloop
+#define G taskloop
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+/* Second instantiation of for-2.h: "taskloop simd", functions named <x>_taskloop_simd_normal.  */
+#define F taskloop simd
+#define G taskloop_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+int
+main ()  /* Runs both taskloop test drivers inside a parallel region; aborts on any failure.  */
+{
+  int err = 0;  /* Becomes 1 if either driver returns nonzero; OR-reduced across threads.  */
+  #pragma omp parallel reduction(|:err)
+    #pragma omp single
+      {  /* The drivers are called from within the single construct.  */
+	if (test_taskloop_normal ()
+	    || test_taskloop_simd_normal ())
+	  err = 1;
+      }
+  if (err)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-5.c b/libgomp/testsuite/libgomp.c/for-5.c
new file mode 100644
index 00000000000..84e636ab0f9
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-5.c
@@ -0,0 +1,154 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+/* N(x) pastes x_G_normal (M expands G before O pastes); F and S fill for-2.h's "#pragma omp F S".  */
+#pragma omp declare target
+/* Plain "omp for" instantiation (<x>_f_normal), compiled between declare target / end declare target.  */
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+/* for-2.h defaulted OMPFROM/OMPTO to no-ops; later includes get real target update directives.  */
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#define F target parallel for
+#define G tpf
+#include "for-1.h"  /* NOTE(review): for-1.h is not visible here; main expects test_<G>_{static,static32,auto,guided32,runtime} from it.  */
+#undef F
+#undef G
+/* for-2.h instantiations yield only the <x>_<G>_normal variant; S carries any extra clause.  */
+#define F target simd
+#define G t_simd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target parallel for simd
+#define G tpf_simd
+#include "for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute
+#define G ttd
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+/* Same directive as the previous group, with dist_schedule(static, 128) supplied through S.  */
+#define F target teams distribute
+#define G ttd_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute simd
+#define G ttds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute simd
+#define G ttds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F target teams distribute parallel for
+#define G ttdpf
+#include "for-1.h"
+#undef F
+#undef G
+/* S is not set for the for-1.h includes, so dist_schedule is appended to F itself.  */
+#define F target teams distribute parallel for dist_schedule(static, 128)
+#define G ttdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute parallel for simd
+#define G ttdpfs
+#include "for-1.h"
+#undef F
+#undef G
+
+#define F target teams distribute parallel for simd dist_schedule(static, 128)
+#define G ttdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+
+int
+main ()  /* Calls every generated test_<G>_<variant> driver; any nonzero result aborts.  */
+{
+  if (test_tpf_static ()  /* || short-circuits, so drivers after the first failure are not run.  */
+      || test_tpf_static32 ()
+      || test_tpf_auto ()
+      || test_tpf_guided32 ()
+      || test_tpf_runtime ()
+      || test_t_simd_normal ()
+      || test_tpf_simd_static ()
+      || test_tpf_simd_static32 ()
+      || test_tpf_simd_auto ()
+      || test_tpf_simd_guided32 ()
+      || test_tpf_simd_runtime ()
+      || test_ttd_normal ()
+      || test_ttd_ds128_normal ()
+      || test_ttds_normal ()
+      || test_ttds_ds128_normal ()
+      || test_ttdpf_static ()
+      || test_ttdpf_static32 ()
+      || test_ttdpf_auto ()
+      || test_ttdpf_guided32 ()
+      || test_ttdpf_runtime ()
+      || test_ttdpf_ds128_static ()
+      || test_ttdpf_ds128_static32 ()
+      || test_ttdpf_ds128_auto ()
+      || test_ttdpf_ds128_guided32 ()
+      || test_ttdpf_ds128_runtime ()
+      || test_ttdpfs_static ()
+      || test_ttdpfs_static32 ()
+      || test_ttdpfs_auto ()
+      || test_ttdpfs_guided32 ()
+      || test_ttdpfs_runtime ()
+      || test_ttdpfs_ds128_static ()
+      || test_ttdpfs_ds128_static32 ()
+      || test_ttdpfs_ds128_auto ()
+      || test_ttdpfs_ds128_guided32 ()
+      || test_ttdpfs_ds128_runtime ())
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/for-6.c b/libgomp/testsuite/libgomp.c/for-6.c
new file mode 100644
index 00000000000..7f3c65e82b1
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/for-6.c
@@ -0,0 +1,123 @@
+/* { dg-additional-options "-std=gnu99" } */
+
+extern void abort ();
+
+#define M(x, y, z) O(x, y, z)
+#define O(x, y, z) x ## _ ## y ## _ ## z
+/* N(x) pastes x_G_normal (M expands G before O pastes); F and S fill for-2.h's "#pragma omp F S".  */
+#pragma omp declare target
+/* Plain "omp for" instantiation (<x>_f_normal), compiled between declare target / end declare target.  */
+#define F for
+#define G f
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#pragma omp end declare target
+/* Replace for-2.h's empty OMPTGT and no-op OMPFROM/OMPTO: each later loop gets its own "omp target".  */
+#undef OMPTGT
+#undef OMPFROM
+#undef OMPTO
+#define DO_PRAGMA(x) _Pragma (#x)
+#define OMPTGT DO_PRAGMA (omp target)
+#define OMPFROM(v) DO_PRAGMA (omp target update from(v))
+#define OMPTO(v) DO_PRAGMA (omp target update to(v))
+
+#define F teams distribute
+#define G td
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"  /* OMPTGT precedes each "#pragma omp F S" in for-2.h, so F omits "target" here.  */
+#undef S
+#undef N
+#undef F
+#undef G
+/* Same directive as the previous group, with dist_schedule(static, 128) supplied through S.  */
+#define F teams distribute
+#define G td_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F teams distribute simd
+#define G tds
+#define S
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+
+#define F teams distribute simd
+#define G tds_ds128
+#define S dist_schedule(static, 128)
+#define N(x) M(x, G, normal)
+#include "for-2.h"
+#undef S
+#undef N
+#undef F
+#undef G
+/* NOTE(review): for-1.h is not visible here; main expects test_<G>_{static,static32,auto,guided32,runtime} from it.  */
+#define F teams distribute parallel for
+#define G tdpf
+#include "for-1.h"
+#undef F
+#undef G
+/* S is not set for the for-1.h includes, so dist_schedule is appended to F itself.  */
+#define F teams distribute parallel for dist_schedule(static, 128)
+#define G tdpf_ds128
+#include "for-1.h"
+#undef F
+#undef G
+
+#define F teams distribute parallel for simd
+#define G tdpfs
+#include "for-1.h"
+#undef F
+#undef G
+
+#define F teams distribute parallel for simd dist_schedule(static, 128)
+#define G tdpfs_ds128
+#include "for-1.h"
+#undef F
+#undef G
+
+int
+main ()  /* Calls every generated test_<G>_<variant> driver; any nonzero result aborts.  */
+{
+  if (test_td_normal ()  /* || short-circuits, so drivers after the first failure are not run.  */
+      || test_td_ds128_normal ()
+      || test_tds_normal ()
+      || test_tds_ds128_normal ()
+      || test_tdpf_static ()
+      || test_tdpf_static32 ()
+      || test_tdpf_auto ()
+      || test_tdpf_guided32 ()
+      || test_tdpf_runtime ()
+      || test_tdpf_ds128_static ()
+      || test_tdpf_ds128_static32 ()
+      || test_tdpf_ds128_auto ()
+      || test_tdpf_ds128_guided32 ()
+      || test_tdpf_ds128_runtime ()
+      || test_tdpfs_static ()
+      || test_tdpfs_static32 ()
+      || test_tdpfs_auto ()
+      || test_tdpfs_guided32 ()
+      || test_tdpfs_runtime ()
+      || test_tdpfs_ds128_static ()
+      || test_tdpfs_ds128_static32 ()
+      || test_tdpfs_ds128_auto ()
+      || test_tdpfs_ds128_guided32 ()
+      || test_tdpfs_ds128_runtime ())
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/linear-1.c b/libgomp/testsuite/libgomp.c/linear-1.c
new file mode 100644
index 00000000000..f86fb33c5da
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/linear-1.c
@@ -0,0 +1,250 @@
+int a[256];
+
+__attribute__((noinline, noclone)) int
+f1 (int i)  /* int i, linear step literal 4, no schedule clause; 48 iterations (j = 16..63).  */
+{
+  #pragma omp parallel for linear (i: 4)
+  for (int j = 16; j < 64; j++)
+    {
+      a[i] = j;  /* Stores the iteration number at the index this iteration sees for i.  */
+      i += 4;  /* Matches the step declared in the linear clause.  */
+    }
+  return i;  /* main expects 8 + 48 * 4 when called with i = 8.  */
+}
+
+__attribute__((noinline, noclone)) short int
+f2 (short int i, char k)  /* short i, linear step k + 1, long loop variable, no schedule clause.  */
+{
+  #pragma omp parallel for linear (i: k + 1)
+  for (long j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;  /* Body hard-codes 4, so k + 1 must be 4; main passes k = 3.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f3 (long long int i, long long int k)  /* long long i, linear step k, short loop variable.  */
+{
+  #pragma omp parallel for linear (i: k)
+  for (short j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;  /* Body hard-codes 4, so k must be 4; main passes 4LL.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (int i)  /* Same loop as f1, with schedule(static, 3).  */
+{
+  #pragma omp parallel for linear (i: 4) schedule(static, 3)
+  for (int j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f5 (short int i, char k)  /* Same loop as f2, with schedule(static, 5).  */
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(static, 5)
+  for (long j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;  /* Requires k + 1 == 4; main passes k = 3.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f6 (long long int i, long long int k)  /* Same loop as f3, with schedule(static, 7).  */
+{
+  #pragma omp parallel for linear (i: k) schedule(static, 7)
+  for (short j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;  /* Requires k == 4; main passes 4LL.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f7 (int i)  /* Same loop as f1, with schedule(dynamic, 3).  */
+{
+  #pragma omp parallel for linear (i: 4) schedule(dynamic, 3)
+  for (int j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f8 (short int i, char k)  /* Same loop as f2, with schedule(dynamic, 5).  */
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(dynamic, 5)
+  for (long j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;  /* Requires k + 1 == 4; main passes k = 3.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f9 (long long int i, long long int k)  /* Same loop as f3, with schedule(dynamic, 7).  */
+{
+  #pragma omp parallel for linear (i: k) schedule(dynamic, 7)
+  for (short j = 16; j < 64; j++)
+    {
+      a[i] = j;
+      i += 4;  /* Requires k == 4; main passes 4LL.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f10 (int i, long step)  /* Like f1, but the loop increment is a run-time value (main passes 2).  */
+{
+  #pragma omp parallel for linear (i: 4)
+  for (int j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;  /* With step 2 this yields 16, 17, ..., 63, the same values f1 stores.  */
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f11 (short int i, char k, char step)  /* Like f2, with a run-time char loop increment (main passes 2).  */
+{
+  #pragma omp parallel for linear (i: k + 1)
+  for (long j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;  /* Requires k + 1 == 4; main passes k = 3.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f12 (long long int i, long long int k, int step)  /* Like f3, with a run-time int loop increment (main passes 2).  */
+{
+  #pragma omp parallel for linear (i: k)
+  for (short j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;  /* Requires k == 4; main passes 4LL.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f13 (int i, long long int step)  /* Like f10, with schedule(static, 3) and a long long increment.  */
+{
+  #pragma omp parallel for linear (i: 4) schedule(static, 3)
+  for (int j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f14 (short int i, char k, int step)  /* Like f11, with schedule(static, 5) and an int increment.  */
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(static, 5)
+  for (long j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;  /* Requires k + 1 == 4; main passes k = 3.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f15 (long long int i, long long int k, long int step)  /* Like f12, with schedule(static, 7) and a long increment.  */
+{
+  #pragma omp parallel for linear (i: k) schedule(static, 7)
+  for (short j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;  /* Requires k == 4; main passes 4LL.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) int
+f16 (int i, long long int step)  /* Like f13, with schedule(dynamic, 3).  */
+{
+  #pragma omp parallel for linear (i: 4) schedule(dynamic, 3)
+  for (int j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) short int
+f17 (short int i, char k, int step)  /* Like f14, with schedule(dynamic, 5).  */
+{
+  #pragma omp parallel for linear (i: k + 1) schedule(dynamic, 5)
+  for (long j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;  /* Requires k + 1 == 4; main passes k = 3.  */
+    }
+  return i;
+}
+
+__attribute__((noinline, noclone)) long long int
+f18 (long long int i, long long int k, long int step)  /* Like f15, with schedule(dynamic, 7).  */
+{
+  #pragma omp parallel for linear (i: k) schedule(dynamic, 7)
+  for (short j = 16; j < 112; j += step)
+    {
+      a[i] = j / 2 + 8;
+      i += 4;  /* Requires k == 4; main passes 4LL.  */
+    }
+  return i;
+}
+
+int
+main ()  /* Runs f1..f18 starting from i = 8 and validates each call with TEST.  */
+{  /* TEST: result must be 8 + 48 * 4; a[8], a[12], ... below that hold 16, 17, ...; all else 0; then a is cleared.  */
+#define TEST(x) \
+  if (x != 8 + 48 * 4)				\
+    __builtin_abort ();				\
+  for (int i = 0; i < 256; i++)			\
+    if (a[i] != (((i & 3) == 0 && i >= 8	\
+		  && i < 8 + 48 * 4)		\
+		 ? ((i - 8) / 4) + 16 : 0))	\
+      __builtin_abort ();			\
+  __builtin_memset (a, 0, sizeof (a))
+  TEST (f1 (8));  /* f1-f9: unit loop increment.  */
+  TEST (f2 (8, 3));
+  TEST (f3 (8LL, 4LL));
+  TEST (f4 (8));
+  TEST (f5 (8, 3));
+  TEST (f6 (8LL, 4LL));
+  TEST (f7 (8));
+  TEST (f8 (8, 3));
+  TEST (f9 (8LL, 4LL));
+  TEST (f10 (8, 2));  /* f10-f18: loop increment passed at run time (always 2 here).  */
+  TEST (f11 (8, 3, 2));
+  TEST (f12 (8LL, 4LL, 2));
+  TEST (f13 (8, 2));
+  TEST (f14 (8, 3, 2));
+  TEST (f15 (8LL, 4LL, 2));
+  TEST (f16 (8, 2));
+  TEST (f17 (8, 3, 2));
+  TEST (f18 (8LL, 4LL, 2));
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/ordered-4.c b/libgomp/testsuite/libgomp.c/ordered-4.c
new file mode 100644
index 00000000000..8412d4715c3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/ordered-4.c
@@ -0,0 +1,83 @@
+extern
+#ifdef __cplusplus
+"C"
+#endif
+void abort (void);
+
+void
+foo (int i, char *j)  /* Body of main's third loop: an "ordered threads" region in a called function.  */
+{
+  #pragma omp atomic
+  j[i]++;  /* main's second loop left each j[i] at 2, so this makes it 3.  */
+  #pragma omp ordered threads
+  {
+    int t;
+    #pragma omp atomic read
+    t = j[i];
+    if (t != 3)  /* Own counter: incremented once above, not yet by the statement after the region.  */
+      abort ();
+    if (i > 1)  /* NOTE(review): skips the predecessor check for i == 1; i > 0 looks equally valid - confirm.  */
+      {
+	#pragma omp atomic read
+	t = j[i - 1];
+	if (t == 2)  /* Predecessor must at least have done its first increment.  */
+	  abort ();
+      }
+    if (i < 127)
+      {
+	#pragma omp atomic read
+	t = j[i + 1];
+	if (t == 4)  /* Successor must not yet have passed its own ordered region.  */
+	  abort ();
+      }
+  }
+  #pragma omp atomic
+  j[i]++;  /* Second increment, after the ordered region: j[i] becomes 4.  */
+}
+
+int
+main ()  /* Checks "ordered threads" inline (dynamic schedule) and via foo (static schedule).  */
+{
+  int i;
+  char j[128];  /* Per-iteration progress counters, updated atomically.  */
+  #pragma omp parallel
+  {
+    #pragma omp for
+    for (i = 0; i < 128; i++)
+      j[i] = 0;
+    #pragma omp for ordered schedule(dynamic, 1)
+    for (i = 0; i < 128; i++)
+      {
+	#pragma omp atomic
+	j[i]++;  /* 0 -> 1 before the ordered region.  */
+	#pragma omp ordered threads
+	{
+	  int t;
+	  #pragma omp atomic read
+	  t = j[i];
+	  if (t != 1)
+	    abort ();
+	  if (i > 1)  /* NOTE(review): skips the predecessor check for i == 1; i > 0 looks equally valid - confirm.  */
+	    {
+	      #pragma omp atomic read
+	      t = j[i - 1];
+	      if (t == 0)  /* Predecessor must at least have done its first increment.  */
+		abort ();
+	    }
+	  if (i < 127)
+	    {
+	      #pragma omp atomic read
+	      t = j[i + 1];
+	      if (t == 2)  /* Successor must not yet have passed its own ordered region.  */
+		abort ();
+	    }
+	}
+	#pragma omp atomic
+	j[i]++;  /* 1 -> 2 after the ordered region; foo relies on this starting value.  */
+      }
+    #pragma omp for ordered schedule(static, 1)
+    for (i = 0; i < 128; i++)
+      foo (i, j);  /* Same protocol, continuing from 2 (values 3 and 4).  */
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/pr66199-2.c b/libgomp/testsuite/libgomp.c/pr66199-2.c
index ddb79de8943..2fc9eec529a 100644
--- a/libgomp/testsuite/libgomp.c/pr66199-2.c
+++ b/libgomp/testsuite/libgomp.c/pr66199-2.c
@@ -18,12 +18,11 @@ __attribute__((noinline, noclone)) void
 f2 (long a, long b, long c)
 {
   long d, e;
-  #pragma omp target teams distribute parallel for simd default(none) firstprivate (a, b) shared(u, v, w) linear(d) linear(c:5) lastprivate(e)
+  #pragma omp target teams distribute parallel for simd default(none) firstprivate (a, b, c) shared(u, v, w) linear(d) lastprivate(e)
   for (d = a; d < b; d++)
     {
       u[d] = v[d] + w[d];
-      c += 5;
-      e = c;
+      e = c + d * 5;
     }
 }
 
diff --git a/libgomp/testsuite/libgomp.c/pr66199-3.c b/libgomp/testsuite/libgomp.c/pr66199-3.c
new file mode 100644
index 00000000000..fe0ccb47197
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/pr66199-3.c
@@ -0,0 +1,50 @@
+/* PR middle-end/66199 */
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp" } */
+
+int u[1024], v[1024], w[1024];
+
+__attribute__((noinline, noclone)) long
+f1 (long a, long b)  /* parallel for with the iteration variable d listed as lastprivate.  */
+{
+  long d;
+  #pragma omp parallel for lastprivate (d) default(none) firstprivate (a, b) shared(u, v, w)
+  for (d = a; d < b; d++)
+    u[d] = v[d] + w[d];
+  return d;  /* main expects 1024 for the range 0..1024.  */
+}
+
+__attribute__((noinline, noclone)) long
+f2 (long a, long b, long c)  /* Adds linear(c:5) and a second lastprivate (e) to the f1 loop.  */
+{
+  long d, e;
+  #pragma omp parallel for lastprivate (d) default(none) firstprivate (a, b) shared(u, v, w) linear(c:5) lastprivate(e)
+  for (d = a; d < b; d++)
+    {
+      u[d] = v[d] + w[d];
+      c += 5;  /* Matches the linear step.  */
+      e = c;
+    }
+  return d + c + e;  /* main expects 1024 + 2 * (17 + 5 * 1024) for (0, 1024, 17).  */
+}
+
+__attribute__((noinline, noclone)) long
+f3 (long a1, long b1, long a2, long b2)  /* collapse(2) nest with both iteration variables lastprivate.  */
+{
+  long d1, d2;
+  #pragma omp parallel for default(none) firstprivate (a1, b1, a2, b2) shared(u, v, w) lastprivate(d1, d2) collapse(2)
+  for (d1 = a1; d1 < b1; d1++)
+    for (d2 = a2; d2 < b2; d2++)
+      u[d1 * 32 + d2] = v[d1 * 32 + d2] + w[d1 * 32 + d2];  /* Row stride 32 is hard-coded; main uses 32 x 32.  */
+  return d1 + d2;  /* main expects 64 for (0, 32, 0, 32).  */
+}
+
+int
+main ()  /* Checks the values f1-f3 return after their loops.  */
+{
+  if (f1 (0, 1024) != 1024
+      || f2 (0, 1024, 17) != 1024 + 2 * (17 + 5 * 1024)  /* d, plus c and e which both end at 17 + 5 * 1024.  */
+      || f3 (0, 32, 0, 32) != 64)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/pr66199-4.c b/libgomp/testsuite/libgomp.c/pr66199-4.c
new file mode 100644
index 00000000000..a9b1bb8a59e
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/pr66199-4.c
@@ -0,0 +1,58 @@
+/* PR middle-end/66199 */
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp" } */
+
+#pragma omp declare target
+int u[1024], v[1024], w[1024];
+#pragma omp end declare target
+
+__attribute__((noinline, noclone)) void
+f1 (long a, long b)  /* Combined target construct; d appears in no data-sharing clause.  */
+{
+  long d;
+  #pragma omp target teams distribute parallel for default(none) firstprivate (a, b) shared(u, v, w)
+  for (d = a; d < b; d++)
+    u[d] = v[d] + w[d];  /* u, v, w are file-scope arrays inside "declare target".  */
+}
+
+__attribute__((noinline, noclone)) void
+f2 (long a, long b, long c)  /* Same construct with lastprivate(d, e); c is firstprivate.  */
+{
+  long d, e;
+  #pragma omp target teams distribute parallel for default(none) firstprivate (a, b, c) shared(u, v, w) lastprivate(d, e)
+  for (d = a; d < b; d++)
+    {
+      u[d] = v[d] + w[d];
+      e = c + d * 5;  /* Computed from d rather than accumulated; the result is not checked.  */
+    }
+}
+
+__attribute__((noinline, noclone)) void
+f3 (long a1, long b1, long a2, long b2)  /* collapse(2) with both iteration variables lastprivate.  */
+{
+  long d1, d2;
+  #pragma omp target teams distribute parallel for default(none) firstprivate (a1, b1, a2, b2) shared(u, v, w) lastprivate(d1, d2) collapse(2)
+  for (d1 = a1; d1 < b1; d1++)
+    for (d2 = a2; d2 < b2; d2++)
+      u[d1 * 32 + d2] = v[d1 * 32 + d2] + w[d1 * 32 + d2];  /* Row stride 32 is hard-coded; main uses 32 x 32.  */
+}
+
+__attribute__((noinline, noclone)) void
+f4 (long a1, long b1, long a2, long b2)  /* Same as f3 but without the lastprivate clause.  */
+{
+  long d1, d2;
+  #pragma omp target teams distribute parallel for default(none) firstprivate (a1, b1, a2, b2) shared(u, v, w) collapse(2)
+  for (d1 = a1; d1 < b1; d1++)
+    for (d2 = a2; d2 < b2; d2++)
+      u[d1 * 32 + d2] = v[d1 * 32 + d2] + w[d1 * 32 + d2];  /* Row stride 32 is hard-coded; main uses 32 x 32.  */
+}
+
+int
+main ()  /* Only calls f1-f4; no results are compared here.  */
+{
+  f1 (0, 1024);
+  f2 (0, 1024, 17);
+  f3 (0, 32, 0, 32);  /* 32 x 32 covers the 1024-element arrays.  */
+  f4 (0, 32, 0, 32);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/reduction-10.c b/libgomp/testsuite/libgomp.c/reduction-10.c
new file mode 100644
index 00000000000..3c95ebd4a4b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reduction-10.c
@@ -0,0 +1,105 @@
+struct A { int t; };  /* Used with the user-defined + and * reductions below.  */
+struct B { char t; };  /* Used with the + reduction implemented by add/zero.  */
+struct C { unsigned long long t; };  /* Used with the | reduction implemented by orit.  */
+struct D { long t; };  /* Used with the & reduction below.  */
+void
+add (struct B *x, struct B *y)  /* Combiner for "+" on struct B: accumulates *y into *x.  */
+{
+  x->t += y->t;
+}
+void
+zero (struct B *x)  /* Initializer for the "+" reduction on struct B.  */
+{
+  x->t = 0;
+}
+void
+orit (struct C *x, struct C *y)  /* Combiner for "|" on struct C; note (source, destination) order.  */
+{
+  y->t |= x->t;  /* The declare reduction passes (&omp_in, &omp_out), so omp_out is updated.  */
+}
+#pragma omp declare reduction(+:struct A:omp_out.t += omp_in.t)
+#pragma omp declare reduction(+:struct B:add (&omp_out, &omp_in)) initializer(zero (&omp_priv))
+#pragma omp declare reduction(*:struct A:omp_out.t *= omp_in.t) initializer(omp_priv = { 1 })
+#pragma omp declare reduction(|:struct C:orit (&omp_in, &omp_out))
+#pragma omp declare reduction(&:struct D:omp_out.t = omp_out.t & omp_in.t) initializer(omp_priv = { ~0L })
+#pragma omp declare reduction(maxb:short:omp_out = omp_in > omp_out ? omp_in : omp_out) initializer(omp_priv = -6)
+
+struct B z[10];
+
+__attribute__((noinline, noclone)) void
+foo (struct A (*x)[3][2], struct A *y, struct D w[1][2], int p1, long p2, long p3, int p4,
+     int p5, long p6, short p7)  /* p1..p6 give the array-section bounds at run time; p7 sizes the VLAs.  */
+{
+  struct C a[p7 + 4];  /* VLAs cannot take an initializer, hence the loop below.  */
+  short b[p7];
+  int i;
+  for (i = 0; i < p7 + 4; i++)
+    {
+      if (i < p7)
+	b[i] = -6;  /* Same value as the maxb initializer.  */
+      a[i].t = 0;
+    }
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 1][:p6]) reduction(maxb:b)
+  for (i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;
+      if ((i & 15) == 1)
+	y[0].t *= 3;
+      if ((i & 31) == 2)
+	y[1].t *= 7;
+      if ((i & 63) == 3)
+	y[2].t *= 17;
+      z[i / 32].t += (i & 3);
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30);  /* Sets even bits 0..30 in a[0..3], i.e. 0x55555555.  */
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3));  /* Clears bits 0, 3, ..., 21, i.e. mask 0x249249.  */
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (i = 0; i < 9; i++)  /* The bound 9 and the b[0..4] accesses assume p7 == 5, as main passes.  */
+    if (a[i].t != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[2] != 22 || b[3] != 84 || b[4] != 127)
+    __builtin_abort ();
+}
+
+int
+main ()  /* Calls foo on interior slices and compares against precomputed results.  */
+{
+  struct A a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};  /* Expected a: rows 0 and 3 must stay zero.  */
+  struct A y[5] = { { 0 }, { 1 }, { 1 }, { 1 }, { 0 } };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  /* Expected y: the end elements must stay 0.  */
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };  /* Expected global z.  */
+  struct D w[1][2] = { { { ~0L }, { ~0L } } };
+  foo (&a[1], y + 1, w, 1, 3L, 4L, 3, 4, 2L, 5);  /* foo's x[0..1] is a[1..2]; its y[0..2] is y[1..3].  */
+  int i, j, k;
+  for (i = 0; i < 4; i++)
+    for (j = 0; j < 3; j++)
+      for (k = 0; k < 2; k++)
+	if (a[i][j][k].t != a2[i][j][k])
+	  __builtin_abort ();
+  for (i = 0; i < 5; i++)
+    if (y[i].t != y2[i])
+      __builtin_abort ();
+  for (i = 0; i < 10; i++)
+    if (z[i].t != z2[i])
+      __builtin_abort ();
+  if (w[0][0].t != ~0x249249L || w[0][1].t != ~0x249249L)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/reduction-7.c b/libgomp/testsuite/libgomp.c/reduction-7.c
new file mode 100644
index 00000000000..347c26f46d3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reduction-7.c
@@ -0,0 +1,64 @@
+char z[10] = { 0 };
+
+__attribute__((noinline, noclone)) void
+foo (int (*x)[3][2], int *y, long w[1][2])  /* Built-in reductions over array sections with constant bounds.  */
+{
+  unsigned long long a[9] = {};  /* Only a[:4] is reduced; a[4..8] must stay 0.  */
+  short b[5] = {};
+  int i;
+  #pragma omp parallel for reduction(+:x[0:2][:][0:2], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:1][:2]) reduction(max:b)
+  for (i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;
+      if ((i & 15) == 1)
+	y[0] *= 3;
+      if ((i & 31) == 2)
+	y[1] *= 7;
+      if ((i & 63) == 3)
+	y[2] *= 17;
+      z[i / 32] += (i & 3);
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30);  /* Sets even bits 0..30 in a[0..3], i.e. 0x55555555.  */
+      w[0][i & 1] &= ~(1L << (i / 17 * 3));  /* Clears bits 0, 3, ..., 21, i.e. mask 0x249249.  */
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (i = 0; i < 9; i++)
+    if (a[i] != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[2] != 22 || b[3] != 84 || b[4] != 127)
+    __builtin_abort ();
+}
+
+int
+main ()  /* Calls foo on interior slices and compares against precomputed results.  */
+{
+  int a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};  /* Expected a: rows 0 and 3 must stay zero.  */
+  int y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 };  /* Expected y: the end elements must stay 0.  */
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };  /* Expected global z.  */
+  long w[1][2] = { ~0L, ~0L };
+  foo (&a[1], y + 1, w);  /* foo's x[0..1] is a[1..2]; its y[0..2] is y[1..3].  */
+  if (__builtin_memcmp (a, a2, sizeof (a))
+      || __builtin_memcmp (y, y2, sizeof (y))
+      || __builtin_memcmp (z, z2, sizeof (z))
+      || w[0][0] != ~0x249249L
+      || w[0][1] != ~0x249249L)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/reduction-8.c b/libgomp/testsuite/libgomp.c/reduction-8.c
new file mode 100644
index 00000000000..f4ec03aabea
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reduction-8.c
@@ -0,0 +1,98 @@
+struct A { int t; };
+struct B { char t; };
+struct C { unsigned long long t; };
+struct D { long t; };
+void
+add (struct B *x, struct B *y)
+{ /* Combiner used by the '+' reduction on struct B: adds y->t into x->t.  */
+  x->t += y->t;
+}
+void
+zero (struct B *x)
+{ /* Initializer used by the '+' reduction on struct B: resets x->t to 0.  */
+  x->t = 0;
+}
+void
+orit (struct C *x, struct C *y)
+{ /* Combiner used by the '|' reduction on struct C; the destination is y, not x.  */
+  y->t |= x->t;
+}
+#pragma omp declare reduction(+:struct A:omp_out.t += omp_in.t)
+#pragma omp declare reduction(+:struct B:add (&omp_out, &omp_in)) initializer(zero (&omp_priv))
+#pragma omp declare reduction(*:struct A:omp_out.t *= omp_in.t) initializer(omp_priv = { 1 })
+#pragma omp declare reduction(|:struct C:orit (&omp_in, &omp_out))
+#pragma omp declare reduction(&:struct D:omp_out.t = omp_out.t & omp_in.t) initializer(omp_priv = { ~0L })
+#pragma omp declare reduction(maxb:short:omp_out = omp_in > omp_out ? omp_in : omp_out) initializer(omp_priv = -6)
+
+struct B z[10];
+
+__attribute__((noinline, noclone)) void
+foo (struct A (*x)[3][2], struct A *y, struct D w[1][2])
+{ /* Exercise user-defined (declare reduction) array-section reductions; self-check a and b.  */
+  struct C a[9] = {};
+  short b[5] = {};
+  int i;
+  #pragma omp parallel for reduction(+:x[0:2][:][0:2], z[:4]) \
+			   reduction(*:y[:3]) reduction(|:a[:4]) \
+			   reduction(&:w[0:1][:2]) reduction(maxb:b)
+  for (i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1].t += i;
+      if ((i & 15) == 1)
+	y[0].t *= 3; /* Taken 8 times for i in 0..127.  */
+      if ((i & 31) == 2)
+	y[1].t *= 7; /* Taken 4 times.  */
+      if ((i & 63) == 3)
+	y[2].t *= 17; /* Taken twice.  */
+      z[i / 32].t += (i & 3);
+      if (i < 4)
+	z[i].t += i;
+      a[i / 32].t |= 1ULL << (i & 30); /* Sets only even bits 0..30, i.e. 0x55555555.  */
+      w[0][i & 1].t &= ~(1L << (i / 17 * 3)); /* Clears bits 0, 3, ..., 21.  */
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (i = 0; i < 9; i++) /* The loop writes only a[0..3]; a[4..8] must stay 0.  */
+    if (a[i].t != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[2] != 22 || b[3] != 84 || b[4] != 127)
+    __builtin_abort (); /* Each b[k] must hold the maximum of its modulus over 0..127.  */
+}
+
+int
+main ()
+{ /* Call foo on interior slices and compare each struct member with its expected value.  */
+  struct A a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};
+  struct A y[5] = { { 0 }, { 1 }, { 1 }, { 1 }, { 0 } };
+  int y2[5] = { 0, 6561, 2401, 289, 0 }; /* 3^8, 7^4, 17^2 between untouched zeros.  */
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };
+  struct D w[1][2] = { { { ~0L }, { ~0L } } };
+  foo (&a[1], y + 1, w); /* foo's x[0..1] alias a[1..2]; its y[0..2] alias y[1..3].  */
+  int i, j, k;
+  for (i = 0; i < 4; i++) /* Compare member-wise: the expected images are plain ints/chars.  */
+    for (j = 0; j < 3; j++)
+      for (k = 0; k < 2; k++)
+	if (a[i][j][k].t != a2[i][j][k])
+	  __builtin_abort ();
+  for (i = 0; i < 5; i++)
+    if (y[i].t != y2[i])
+      __builtin_abort ();
+  for (i = 0; i < 10; i++)
+    if (z[i].t != z2[i])
+      __builtin_abort ();
+  if (w[0][0].t != ~0x249249L || w[0][1].t != ~0x249249L)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/reduction-9.c b/libgomp/testsuite/libgomp.c/reduction-9.c
new file mode 100644
index 00000000000..13605c1ab88
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/reduction-9.c
@@ -0,0 +1,71 @@
+char z[10] = { 0 };
+
+__attribute__((noinline, noclone)) void
+foo (int (*x)[3][2], int *y, long w[1][2], int p1, long p2, long p3, int p4,
+     int p5, long p6, short p7)
+{ /* Array-section reductions whose bounds and VLA sizes come from arguments p1..p7.  */
+  unsigned long long a[p7 + 4];
+  short b[p7];
+  int i;
+  for (i = 0; i < p7 + 4; i++) /* Fill both VLAs by hand; b shares the loop for i < p7.  */
+    {
+      if (i < p7)
+	b[i] = -6;
+      a[i] = 0;
+    }
+  #pragma omp parallel for reduction(+:x[0:p1 + 1][:p2], z[:p3]) \
+			   reduction(*:y[:p4]) reduction(|:a[:p5]) \
+			   reduction(&:w[0:p6 - 1][:p6]) reduction(max:b)
+  for (i = 0; i < 128; i++)
+    {
+      x[i / 64][i % 3][(i / 4) & 1] += i;
+      if ((i & 15) == 1)
+	y[0] *= 3; /* Taken 8 times for i in 0..127.  */
+      if ((i & 31) == 2)
+	y[1] *= 7; /* Taken 4 times.  */
+      if ((i & 63) == 3)
+	y[2] *= 17; /* Taken twice.  */
+      z[i / 32] += (i & 3);
+      if (i < 4)
+	z[i] += i;
+      a[i / 32] |= 1ULL << (i & 30); /* Sets only even bits 0..30, i.e. 0x55555555.  */
+      w[0][i & 1] &= ~(1L << (i / 17 * 3)); /* Clears bits 0, 3, ..., 21.  */
+      if ((i % 79) > b[0])
+	b[0] = i % 79;
+      if ((i % 13) > b[1])
+	b[1] = i % 13;
+      if ((i % 23) > b[2])
+	b[2] = i % 23;
+      if ((i % 85) > b[3])
+	b[3] = i % 85;
+      if ((i % 192) > b[4])
+	b[4] = i % 192;
+    }
+  for (i = 0; i < 9; i++) /* The literal 9 and b[0..4] below assume p7 == 5, as main passes.  */
+    if (a[i] != (i < 4 ? 0x55555555ULL : 0))
+      __builtin_abort ();
+  if (b[0] != 78 || b[1] != 12 || b[2] != 22 || b[3] != 84 || b[4] != 127)
+    __builtin_abort (); /* Each b[k] must hold the maximum of its modulus over 0..127.  */
+}
+
+int
+main ()
+{ /* Call foo with run-time section bounds and compare every array with its expected image.  */
+  int a[4][3][2] = {};
+  static int a2[4][3][2] = {{{ 0, 0 }, { 0, 0 }, { 0, 0 }},
+			    {{ 312, 381 }, { 295, 356 }, { 337, 335 }},
+			    {{ 1041, 975 }, { 1016, 1085 }, { 935, 1060 }},
+			    {{ 0, 0 }, { 0, 0 }, { 0, 0 }}};
+  int y[5] = { 0, 1, 1, 1, 0 };
+  int y2[5] = { 0, 6561, 2401, 289, 0 }; /* 3^8, 7^4, 17^2 between untouched zeros.  */
+  char z2[10] = { 48, 49, 50, 51, 0, 0, 0, 0, 0, 0 };
+  long w[1][2] = { ~0L, ~0L };
+  foo (&a[1], y + 1, w, 1, 3L, 4L, 3, 4, 2L, 5); /* Trailing arguments are p1..p7.  */
+  if (__builtin_memcmp (a, a2, sizeof (a))
+      || __builtin_memcmp (y, y2, sizeof (y))
+      || __builtin_memcmp (z, z2, sizeof (z))
+      || w[0][0] != ~0x249249L
+      || w[0][1] != ~0x249249L)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-1.c b/libgomp/testsuite/libgomp.c/target-1.c
index f734d3c279d..c7abb008182 100644
--- a/libgomp/testsuite/libgomp.c/target-1.c
+++ b/libgomp/testsuite/libgomp.c/target-1.c
@@ -34,7 +34,7 @@ fn2 (int x, int y, int z)
   fn1 (b, c, x);
   #pragma omp target data map(to: b)
   {
-    #pragma omp target map(tofrom: c)
+    #pragma omp target map(tofrom: c, s)
       #pragma omp teams num_teams(y) thread_limit(z) reduction(+:s) firstprivate(x)
 	#pragma omp distribute dist_schedule(static, 4) collapse(1)
 	  for (j=0; j < x; j += y)
@@ -52,7 +52,7 @@ fn3 (int x)
   double b[1024], c[1024], s = 0;
   int i;
   fn1 (b, c, x);
-  #pragma omp target map(to: b, c)
+  #pragma omp target map(to: b, c) map(tofrom:s)
     #pragma omp parallel for reduction(+:s)
       for (i = 0; i < x; i++)
 	tgt (), s += b[i] * c[i];
@@ -66,7 +66,8 @@ fn4 (int x, double *p)
   int i;
   fn1 (b, c, x);
   fn1 (d + x, p + x, x);
-  #pragma omp target map(to: b, c[0:x], d[x:x]) map(to:p[x:64 + (x & 31)])
+  #pragma omp target map(to: b, c[0:x], d[x:x]) map(to:p[x:64 + (x & 31)]) \
+		     map(tofrom: s)
     #pragma omp parallel for reduction(+:s)
       for (i = 0; i < x; i++)
 	s += b[i] * c[i] + d[x + i] + p[x + i];
diff --git a/libgomp/testsuite/libgomp.c/target-11.c b/libgomp/testsuite/libgomp.c/target-11.c
new file mode 100644
index 00000000000..625c2863f4b
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-11.c
@@ -0,0 +1,86 @@
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 32
+
+void test_array_section (int *p)
+{ /* Check "always" to/from transfers on sub-sections of the already mapped p[0:N].  */
+  #pragma omp target data map(alloc: p[0:N])
+    {
+      int ok = 1;
+      for (int i = 10; i < 10 + 4; i++)
+	p[i] = 997 * i; /* Host-side values that "always to" must push to the device.  */
+
+      #pragma omp target map(always to:p[10:4]) map(tofrom: ok)
+	for (int i = 10; i < 10 + 4; i++)
+	  if (p[i] != 997 * i)
+	    ok = 0;
+
+      assert (ok);
+
+      #pragma omp target map(always from:p[7:9])
+	for (int i = 0; i < N; i++)
+	  p[i] = i; /* All N are written here; main expects only p[7..15] on the host.  */
+    }
+}
+
+int main ()
+{ /* Check the "always" map modifier on scalars, then on an array section.  */
+  int aa = 0, bb = 0, cc = 0, dd = 0;
+
+  #pragma omp target data map(tofrom: aa) map(to: bb) map(from: cc, dd)
+    {
+      int ok;
+      aa = bb = cc = 1; /* Host-only change; the device copies are not updated by this.  */
+
+      /* Set dd on target to 0 for the further check.  */
+      #pragma omp target map(always to: dd)
+	;
+
+      dd = 1;
+      #pragma omp target map(tofrom: aa) map(always to: bb) \
+	map(always from: cc) map(to: dd) map(from: ok)
+	{
+	  /* bb is always to, aa and dd are not.  */
+	  ok = (aa == 0) && (bb == 1) && (dd == 0);
+	  aa = bb = cc = dd = 2;
+	}
+
+      assert (ok);
+      assert (aa == 1);
+      assert (bb == 1);
+      assert (cc == 2); /* cc is always from.  */
+      assert (dd == 1);
+
+      dd = 3;
+      #pragma omp target map(from: cc) map(always to: dd) map(from: ok)
+	{
+	  ok = (dd == 3); /* dd is always to.  */
+	  cc = dd = 4;
+	}
+
+      assert (ok);
+      assert (cc == 2); /* Plain "from" does not copy back while still mapped.  */
+      assert (dd == 3);
+    }
+
+  assert (aa == 2); /* Device values arrive when the data region ends.  */
+  assert (bb == 1);
+  assert (cc == 4);
+  assert (dd == 4);
+
+  int *array = calloc (N, sizeof (int));
+  test_array_section (array);
+
+  for (int i = 0; i < 7; i++)
+    assert (array[i] == 0);
+  for (int i = 7; i < 7 + 9; i++) /* Only the "always from" section p[7:9] came back.  */
+    assert (array[i] == i);
+  for (int i = 7 + 9; i < N; i++)
+    assert (array[i] == 0);
+
+  free (array);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-12.c b/libgomp/testsuite/libgomp.c/target-12.c
new file mode 100644
index 00000000000..e6b009463ad
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-12.c
@@ -0,0 +1,130 @@
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{ /* Exercise the omp_target_* device memory routines against the default device.  */
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int err;
+  int q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id; /* No usable default device: use the initial device instead.  */
+
+  for (i = 0; i < 128; i++)
+    q[i] = i;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0; /* Nothing to test without the allocation.  */
+
+  if (omp_target_memcpy_rect (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, NULL,
+			      d, id) < 3
+      || omp_target_memcpy_rect (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+				 NULL, id, d) < 3
+      || omp_target_memcpy_rect (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+				 NULL, id, id) < 3)
+    abort (); /* The NULL-pointer queries must each report at least 3.  */
+
+  if (omp_target_associate_ptr (q, p, 128 * sizeof (int), sizeof (int), d) == 0)
+    {
+      size_t volume[3] = { 128, 0, 0 };
+      size_t dst_offsets[3] = { 0, 0, 0 };
+      size_t src_offsets[3] = { 1, 0, 0 };
+      size_t dst_dimensions[3] = { 128, 0, 0 };
+      size_t src_dimensions[3] = { 128, 0, 0 };
+
+      if (omp_target_associate_ptr (q, p, 128 * sizeof (int), sizeof (int), d) != 0)
+	abort (); /* Repeating the identical association must also succeed.  */
+
+      if (omp_target_is_present (q, d) != 1
+	  || omp_target_is_present (&q[32], d) != 1
+	  || omp_target_is_present (&q[128], d) != 1)
+	abort ();
+
+      if (omp_target_memcpy (p, q, 128 * sizeof (int), sizeof (int), 0,
+			     d, id) != 0)
+	abort ();
+
+      #pragma omp target if (d >= 0) device (d >= 0 ? d : 0) map(alloc:q[0:32]) map(from:err)
+      {
+	int j;
+	err = 0;
+	for (j = 0; j < 128; j++)
+	  if (q[j] != j)
+	    err = 1;
+	  else
+	    q[j] += 4;
+      }
+
+      if (err)
+	abort ();
+
+      if (omp_target_memcpy_rect (q, p, sizeof (int), 1, volume,
+				  dst_offsets, src_offsets, dst_dimensions,
+				  src_dimensions, id, d) != 0)
+	abort ();
+
+      for (i = 0; i < 128; i++) /* The host must now see the device-side += 4.  */
+	if (q[i] != i + 4)
+	  abort ();
+
+      volume[2] = 2; /* Re-use the descriptor arrays for a 3-dimensional copy.  */
+      volume[1] = 3;
+      volume[0] = 6;
+      dst_offsets[2] = 1;
+      dst_offsets[1] = 0;
+      dst_offsets[0] = 0;
+      src_offsets[2] = 1;
+      src_offsets[1] = 0;
+      src_offsets[0] = 3;
+      dst_dimensions[2] = 2;
+      dst_dimensions[1] = 3;
+      dst_dimensions[0] = 6;
+      src_dimensions[2] = 3;
+      src_dimensions[1] = 4;
+      src_dimensions[0] = 6;
+      if (omp_target_memcpy_rect (p, q, sizeof (int), 3, volume,
+				  dst_offsets, src_offsets, dst_dimensions,
+				  src_dimensions, d, id) != 0)
+	abort ();
+
+      #pragma omp target if (d >= 0) device (d >= 0 ? d : 0) map(alloc:q[0:32]) map(from:err)
+      {
+	int j, k, l;
+	err = 0;
+	for (j = 0; j < 6; j++)
+	  for (k = 0; k < 3; k++)
+	    for (l = 0; l < 2; l++)
+	      if (q[j * 6 + k * 2 + l] != 3 * 12 + 4 + 1 + l + k * 3 + j * 12)
+		err = 1;
+      }
+
+      if (err)
+	abort ();
+
+      if (omp_target_memcpy (p, p, 10 * sizeof (int), 51 * sizeof (int),
+			     111 * sizeof (int), d, d) != 0)
+	abort (); /* Device-to-device copy within the same allocation.  */
+
+      #pragma omp target if (d >= 0) device (d >= 0 ? d : 0) map(alloc:q[0:32]) map(from:err)
+	{
+	  int j;
+	  err = 0;
+	  for (j = 0; j < 10; j++)
+	    if (q[50 + j] != q[110 + j])
+	      err = 1;
+	}
+
+      if (err)
+	abort ();
+
+      if (omp_target_disassociate_ptr (q, d) != 0)
+	abort ();
+    }
+
+  omp_target_free (p, d);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-13.c b/libgomp/testsuite/libgomp.c/target-13.c
new file mode 100644
index 00000000000..168850b507c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-13.c
@@ -0,0 +1,45 @@
+#ifdef __cplusplus
+extern "C"
+#else
+extern
+#endif
+void abort (void);
+struct S { int s, t; };
+
+void
+foo ()
+{ /* Check private and firstprivate clauses on a target construct.  */
+  int x = 5, y = 6, err = 0;
+  struct S u = { 7, 8 }, v = { 9, 10 };
+  double s = 11.5, t = 12.5;
+  #pragma omp target private (x, u, s) firstprivate (y, v, t) map(from:err)
+  {
+    x = y; /* Give the private copies values taken from the firstprivate ones.  */
+    u = v;
+    s = t;
+    err = (x != 6 || y != 6
+	   || u.s != 9 || u.t != 10 || v.s != 9 || v.t != 10
+	   || s != 12.5 || t != 12.5);
+    x += 1; /* Modify every copy inside the region, then re-check.  */
+    y += 2;
+    u.s += 3;
+    v.t += 4;
+    s += 2.5;
+    t += 3.0;
+    if (x != 7 || y != 8
+	|| u.s != 12 || u.t != 10 || v.s != 9 || v.t != 14
+	|| s != 15.0 || t != 15.5)
+      err = 1;
+  }
+  if (err || x != 5 || y != 6
+      || u.s != 7 || u.t != 8 || v.s != 9 || v.t != 10
+      || s != 11.5 || t != 12.5)
+    abort (); /* The host originals must be untouched by the region.  */
+}
+
+int
+main ()
+{ /* Run the single check; foo aborts on failure.  */
+  foo ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-14.c b/libgomp/testsuite/libgomp.c/target-14.c
new file mode 100644
index 00000000000..17d383407a2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-14.c
@@ -0,0 +1,38 @@
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{ /* Check is_device_ptr with memory obtained from omp_target_alloc.  */
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int err;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id; /* No usable default device: use the initial device instead.  */
+
+  p = omp_target_alloc (128 * sizeof (int), d);
+  if (p == NULL)
+    return 0; /* Nothing to test without the allocation.  */
+
+  #pragma omp target is_device_ptr (p) if (d >= 0) device (d >= 0 ? d : 0)
+  {
+    int i, *q = (int *) p;
+    for (i = 0; i < 128; i++)
+      q[i] = i + 7;
+  }
+  #pragma omp target is_device_ptr (p) if (d >= 0) device (d >= 0 ? d : 0) map(from:err)
+  {
+    int i;
+    err = 0;
+    for (i = 0; i < 128; i++) /* Values stored by the previous region must persist.  */
+      if (((int *) p)[i] != i + 7)
+	err = 1;
+  }
+  if (err)
+    abort ();
+
+  omp_target_free (p, d);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-15.c b/libgomp/testsuite/libgomp.c/target-15.c
new file mode 100644
index 00000000000..fee9252ef3d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-15.c
@@ -0,0 +1,74 @@
+extern void abort (void);
+
+void
+foo (int *x)
+{ /* Map array sections of a pointer and of fixed-size arrays; verify them in the region.  */
+  int a[10], b[15], err, i;
+  for (i = 0; i < 10; i++)
+    a[i] = 7 * i;
+  for (i = 0; i < 15; i++)
+    b[i] = 8 * i;
+  #pragma omp target map(to:x[5:10], a[0:10], b[5:10]) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 10; i++)
+      if (x[5 + i] != 20 + 4 * i /* Expects x[k] == 4 * k, as main initializes it.  */
+	  || a[i] != 7 * i
+	  || b[5 + i] != 40 + 8 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+void
+bar (int n, int v)
+{ /* Like foo, but on VLAs, first with constant and then with run-time section bounds.  */
+  int a[n], b[n], c[n], d[n], e[n], err, i; /* d and e are never referenced below.  */
+  int (*x)[n] = &c;
+  for (i = 0; i < n; i++)
+    {
+      (*x)[i] = 4 * i;
+      a[i] = 7 * i;
+      b[i] = 8 * i;
+    }
+  #pragma omp target map(to:x[0][5:10], a[0:10], b[5:10]) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 10; i++)
+      if ((*x)[5 + i] != 20 + 4 * i
+	  || a[i] != 7 * i
+	  || b[5 + i] != 40 + 8 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++) /* New contents, so stale device data would be detected.  */
+    {
+      (*x)[i] = 9 * i;
+      a[i] = 12 * i;
+      b[i] = 13 * i;
+    }
+  #pragma omp target map(to:x[0][v:v+5], a[v-5:v+5], b[v:v+5]) map(from:err)
+  { /* With v == 5 (as main passes) these are the same sections as in the first region.  */
+    err = 0;
+    for (i = 0; i < 10; i++)
+      if ((*x)[5 + i] != 45 + 9 * i
+	  || a[i] != 12 * i
+	  || b[5 + i] != 65 + 13 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+int
+main ()
+{ /* Build x[k] == 4 * k for foo, then run bar with n == 15 and v == 5.  */
+  int x[15], i;
+  for (i = 0; i < 15; i++)
+    x[i] = 4 * i;
+  foo (x);
+  bar (15, 5);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-16.c b/libgomp/testsuite/libgomp.c/target-16.c
new file mode 100644
index 00000000000..7b0919b1b00
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-16.c
@@ -0,0 +1,45 @@
+extern void abort (void);
+
+void
+foo (int n)
+{ /* Check firstprivate of a VLA on a target construct.  */
+  int a[n], i, err;
+  for (i = 0; i < n; i++)
+    a[i] = 7 * i;
+  #pragma omp target firstprivate (a) map(from:err) private (i)
+  {
+    err = 0;
+    for (i = 0; i < n; i++) /* The private copy must carry the host values.  */
+      if (a[i] != 7 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+void
+bar (int n)
+{ /* Check private of a VLA on a target construct, used from nested parallel loops.  */
+  int a[n], i, err; /* a is deliberately left uninitialized on the host.  */
+  #pragma omp target private (a) map(from:err)
+  {
+    #pragma omp parallel for
+    for (i = 0; i < n; i++)
+      a[i] = 7 * i;
+    err = 0;
+    #pragma omp parallel for reduction(|:err)
+    for (i = 0; i < n; i++) /* Read back what the first parallel loop stored.  */
+      if (a[i] != 7 * i)
+	err |= 1;
+  }
+  if (err)
+    abort ();
+}
+
+int
+main ()
+{ /* Run both checks with a VLA length of 7; each aborts on failure.  */
+  foo (7);
+  bar (7);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-17.c b/libgomp/testsuite/libgomp.c/target-17.c
new file mode 100644
index 00000000000..4a762012eaf
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-17.c
@@ -0,0 +1,99 @@
+extern void abort (void);
+
+void
+foo (int n)
+{ /* Check combinations of map, firstprivate and private for a VLA and a scalar on target.  */
+  int a[n], i, err;
+  for (i = 0; i < n; i++)
+    a[i] = 5 * i;
+  #pragma omp target map(to:a) map(from:err) private(i)
+  {
+    err = 0;
+    for (i = 0; i < n; i++)
+      if (a[i] != 5 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++)
+    a[i] += i; /* Now 6 * i.  */
+  #pragma omp target map(from:err) private(i)
+  { /* No clause names a here, so this relies on the default handling of the VLA.  */
+    err = 0;
+    for (i = 0; i < n; i++)
+      if (a[i] != 6 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++)
+    a[i] += i; /* Now 7 * i.  */
+  #pragma omp target firstprivate (a) map(from:err) private(i)
+  {
+    err = 0;
+    for (i = 0; i < n; i++)
+      if (a[i] != 7 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  int on = n; /* Remember n; the next two regions must not change the host value.  */
+  #pragma omp target firstprivate (n) map(tofrom: n)
+  {
+    n++;
+  }
+  if (on != n)
+    abort ();
+  #pragma omp target map(tofrom: n) private (n)
+  {
+    n = 25;
+  }
+  if (on != n)
+    abort ();
+  for (i = 0; i < n; i++)
+    a[i] += i; /* Now 8 * i.  */
+  #pragma omp target map(to:a) firstprivate (a) map(from:err) private(i)
+  {
+    err = 0;
+    for (i = 0; i < n; i++)
+      if (a[i] != 8 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++)
+    a[i] += i; /* Now 9 * i.  */
+  #pragma omp target firstprivate (a) map(to:a) map(from:err) private(i)
+  {
+    err = 0;
+    for (i = 0; i < n; i++)
+      if (a[i] != 9 * i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++)
+    a[i] += i; /* Now 10 * i.  */
+  #pragma omp target map(tofrom:a) map(from:err) private(a, i)
+  {
+    err = 0;
+    for (i = 0; i < n; i++)
+      a[i] = 7;
+    #pragma omp parallel for reduction(|:err)
+    for (i = 0; i < n; i++)
+      if (a[i] != 7)
+	err |= 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < n; i++) /* Stores of 7 went to the private a; the host a is unchanged.  */
+    if (a[i] != 10 * i)
+      abort ();
+}
+
+int
+main ()
+{ /* Run the check with a VLA length of 9; foo aborts on failure.  */
+  foo (9);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-18.c b/libgomp/testsuite/libgomp.c/target-18.c
new file mode 100644
index 00000000000..cbacaf6a77a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-18.c
@@ -0,0 +1,52 @@
+extern void abort (void);
+
+void
+foo (int n)
+{ /* Check use_device_ptr + is_device_ptr on a pointer, a fixed array and a VLA.  */
+  int a[4] = { 0, 1, 2, 3 }, b[n];
+  int *p = a + 1, i, err;
+  for (i = 0; i < n; i++)
+    b[i] = 9 + i;
+  #pragma omp target data map(to:a)
+  #pragma omp target data use_device_ptr(p) map(from:err)
+  #pragma omp target is_device_ptr(p) private(i) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 4; i++)
+      if (p[i - 1] != i) /* p is a + 1, so p[i - 1] is a[i].  */
+	err = 1;
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < 4; i++)
+    a[i] = 23 + i;
+  #pragma omp target data map(to:a)
+  #pragma omp target data use_device_ptr(a) map(from:err)
+  #pragma omp target is_device_ptr(a) private(i) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 4; i++)
+      if (a[i] != 23 + i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+  #pragma omp target data map(to:b)
+  #pragma omp target data use_device_ptr(b) map(from:err)
+  #pragma omp target is_device_ptr(b) private(i) map(from:err)
+  {
+    err = 0;
+    for (i = 0; i < 4; i++) /* Only the first 4 of the n elements of b are checked.  */
+      if (b[i] != 9 + i)
+	err = 1;
+  }
+  if (err)
+    abort ();
+}
+
+int
+main ()
+{ /* Run the check with a VLA length of 9; foo aborts on failure.  */
+  foo (9);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-19.c b/libgomp/testsuite/libgomp.c/target-19.c
new file mode 100644
index 00000000000..710c5078ff6
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-19.c
@@ -0,0 +1,127 @@
+extern void abort (void);
+
+void
+foo (int *p, int *q, int *r, int n, int m)
+{ /* Check zero-length array sections and implicit pointer mapping on target.  */
+  int i, err, *s = r;
+  #pragma omp target data map(to:p[0:8])
+  {
+    /* For zero length array sections, p points to the start of
+       already mapped range, q to the end of it, and r does not point
+       to a mapped range.  */
+    #pragma omp target map(alloc:p[:0]) map(to:q[:0]) map(from:r[:0]) private(i) map(from:err) firstprivate (s)
+    {
+      err = 0;
+      for (i = 0; i < 8; i++)
+	if (p[i] != i + 1 || q[i - 8] != i + 1)
+	  err = 1;
+      if (p + 8 != q || (r != (int *) 0 && r != s))
+	err = 1;
+    }
+    if (err)
+      abort ();
+    /* Implicit mapping of pointers behaves the same way.  */
+    #pragma omp target private(i) map(from:err) firstprivate (s)
+    {
+      err = 0;
+      for (i = 0; i < 8; i++)
+	if (p[i] != i + 1 || q[i - 8] != i + 1)
+	  err = 1;
+      if (p + 8 != q || (r != (int *) 0 && r != s))
+	err = 1;
+    }
+    if (err)
+      abort ();
+    /* And zero-length array sections, though not known at compile
+       time, behave the same.  */
+    #pragma omp target map(p[:n]) map(tofrom:q[:n]) map(alloc:r[:n]) private(i) map(from:err) firstprivate (s)
+    {
+      err = 0;
+      for (i = 0; i < 8; i++)
+	if (p[i] != i + 1 || q[i - 8] != i + 1)
+	  err = 1;
+      if (p + 8 != q || (r != (int *) 0 && r != s))
+	err = 1;
+    }
+    if (err)
+      abort ();
+    /* Non-zero length array sections, though not known at compile time,
+       behave differently.  */
+    #pragma omp target map(p[:m]) map(tofrom:q[:m]) map(to:r[:m]) private(i) map(from:err)
+    {
+      err = 0;
+      for (i = 0; i < 8; i++)
+	if (p[i] != i + 1)
+	  err = 1;
+      if (q[0] != 9 || r[0] != 10)
+	err = 1;
+    }
+    if (err)
+      abort ();
+    #pragma omp target data map(to:q[0:1])
+    {
+      /* For zero length array sections, p points to the start of
+	 already mapped range, q points to the start of another one,
+	 and r to the end of the second one.  */
+      #pragma omp target map(to:p[:0]) map(from:q[:0]) map(tofrom:r[:0]) private(i) map(from:err)
+      {
+	err = 0;
+	for (i = 0; i < 8; i++)
+	  if (p[i] != i + 1)
+	    err = 1;
+	if (q[0] != 9 || r != q + 1)
+	  err = 1;
+      }
+      if (err)
+	abort ();
+      /* Implicit mapping of pointers behaves the same way.  */
+      #pragma omp target private(i) map(from:err)
+      {
+	err = 0;
+	for (i = 0; i < 8; i++)
+	  if (p[i] != i + 1)
+	    err = 1;
+	if (q[0] != 9 || r != q + 1)
+	  err = 1;
+      }
+      if (err)
+	abort ();
+      /* And zero-length array sections, though not known at compile
+	 time, behave the same.  */
+      #pragma omp target map(p[:n]) map(alloc:q[:n]) map(from:r[:n]) private(i) map(from:err)
+      {
+	err = 0;
+	for (i = 0; i < 8; i++)
+	  if (p[i] != i + 1)
+	    err = 1;
+	if (q[0] != 9 || r != q + 1)
+	  err = 1;
+      }
+      if (err)
+	abort ();
+      /* Non-zero length array sections, though not known at compile time,
+	 behave differently.  */
+      #pragma omp target map(p[:m]) map(alloc:q[:m]) map(tofrom:r[:m]) private(i) map(from:err)
+      {
+	err = 0;
+	for (i = 0; i < 8; i++)
+	  if (p[i] != i + 1)
+	    err = 1;
+	if (q[0] != 9 || r[0] != 10)
+	  err = 1;
+      }
+      if (err)
+	abort ();
+    }
+  }
+}
+
+int
+main ()
+{ /* Build a[k] == k and hand foo three pointers into it plus the two lengths.  */
+  int a[32], i;
+  for (i = 0; i < 32; i++)
+    a[i] = i;
+  foo (a + 1, a + 9, a + 10, 0, 1); /* q == p + 8, r == q + 1; n == 0, m == 1.  */
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-2.c b/libgomp/testsuite/libgomp.c/target-2.c
index ada8dad81ad..0ba766c0a82 100644
--- a/libgomp/testsuite/libgomp.c/target-2.c
+++ b/libgomp/testsuite/libgomp.c/target-2.c
@@ -23,7 +23,7 @@ fn2 (int x)
   int i;
   fn1 (b, c, x);
   fn1 (e, d + x, x);
-  #pragma omp target map(to: b, c[:x], d[x:x], e)
+  #pragma omp target map(to: b, c[:x], d[x:x], e) map(tofrom: s)
     #pragma omp parallel for reduction(+:s)
       for (i = 0; i < x; i++)
 	s += b[i] * c[i] + d[x + i] + sizeof (b) - sizeof (c);
@@ -38,7 +38,7 @@ fn3 (int x)
   int i;
   fn1 (b, c, x);
   fn1 (e, d, x);
-  #pragma omp target
+  #pragma omp target map(tofrom: s)
     #pragma omp parallel for reduction(+:s)
       for (i = 0; i < x; i++)
 	s += b[i] * c[i] + d[i];
@@ -56,7 +56,7 @@ fn4 (int x)
   #pragma omp target data map(from: b, c[:x], d[x:x], e)
     {
       #pragma omp target update to(b, c[:x], d[x:x], e)
-      #pragma omp target map(c[:x], d[x:x])
+      #pragma omp target map(c[:x], d[x:x], s)
 	#pragma omp parallel for reduction(+:s)
 	  for (i = 0; i < x; i++)
 	    {
diff --git a/libgomp/testsuite/libgomp.c/target-20.c b/libgomp/testsuite/libgomp.c/target-20.c
new file mode 100644
index 00000000000..3f4e798a755
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-20.c
@@ -0,0 +1,120 @@
+/* { dg-require-effective-target offload_device_nonshared_as } */
+
+#include <stdlib.h>
+#include <assert.h>
+
+#define N 40
+
+int sum;
+int var1 = 1;
+int var2 = 2;
+
+#pragma omp declare target
+int D[N];
+#pragma omp end declare target
+
+void enter_data (int *X)
+{ /* Issue an unstructured enter-data: "to" for var1, var2, X[0:N]; "alloc" for sum.  */
+  #pragma omp target enter data map(to: var1, var2, X[:N]) map(alloc: sum)
+}
+
+void exit_data_0 (int *D)
+{ /* The parameter D shadows the declare-target global D inside this function.  */
+  #pragma omp target exit data map(delete: D[:N])
+}
+
+void exit_data_1 ()
+{ /* Issue an exit-data with "from" for var1; main expects var1 == 11 afterwards.  */
+  #pragma omp target exit data map(from: var1)
+}
+
+void exit_data_2 (int *X)
+{ /* Issue an exit-data: "from" for var2, "release" for X[0:N] and sum.  */
+  #pragma omp target exit data map(from: var2) map(release: X[:N], sum)
+}
+
+void exit_data_3 (int *p)
+{ /* Issue an exit-data with "from" on a zero-length section starting at p.  */
+  #pragma omp target exit data map(from: p[:0])
+}
+
+void test_nested ()
+{ /* Check exit-data "from", "release" and "delete" inside two nested target data regions.  */
+  int X = 0, Y = 0, Z = 0;
+
+  #pragma omp target data map(from: X, Y, Z)
+    {
+      #pragma omp target data map(from: X, Y, Z)
+	{
+	  #pragma omp target map(from: X, Y, Z)
+	    X = Y = Z = 1337;
+	  assert (X == 0); /* Still mapped by the data regions: nothing copied back yet.  */
+	  assert (Y == 0);
+	  assert (Z == 0);
+
+	  #pragma omp target exit data map(from: X) map(release: Y)
+	  assert (X == 0);
+	  assert (Y == 0);
+
+	  #pragma omp target exit data map(release: Y) map(delete: Z)
+	  assert (Y == 0);
+	  assert (Z == 0);
+	}
+      assert (X == 1337); /* First point at which the device value of X reaches the host.  */
+      assert (Y == 0);
+      assert (Z == 0);
+
+      #pragma omp target map(from: X)
+	X = 2448;
+      assert (X == 2448);
+      assert (Y == 0);
+      assert (Z == 0);
+
+      X = 4896;
+    }
+  assert (X == 4896); /* Leaving the outer region must not overwrite the host X.  */
+  assert (Y == 0);
+  assert (Z == 0);
+}
+
+int main ()
+{ /* Drive the enter/exit data helpers in sequence, asserting host values between steps.  */
+  int *X = malloc (N * sizeof (int));
+  int *Y = malloc (N * sizeof (int));
+  X[10] = 10; /* Only X[10] and Y[20] are read by the target region below.  */
+  Y[20] = 20;
+  enter_data (X);
+
+  exit_data_0 (D); /* This should have no effect on D.  */
+
+  #pragma omp target map(alloc: var1, var2, X[:N]) map(to: Y[:N]) map(always from: sum)
+    {
+      var1 += X[10];
+      var2 += Y[20];
+      sum = var1 + var2;
+      D[sum]++;
+    }
+
+  assert (var1 == 1); /* Host copies are stale until the matching exit data.  */
+  assert (var2 == 2);
+  assert (sum == 33); /* 11 + 22, delivered by "always from".  */
+
+  exit_data_1 ();
+  assert (var1 == 11);
+  assert (var2 == 2);
+
+  /* Increase refcount of already mapped X[0:N].  */
+  #pragma omp target enter data map(alloc: X[16:1])
+
+  exit_data_2 (X);
+  assert (var2 == 22);
+
+  exit_data_3 (X + 5); /* Unmap X[0:N].  */
+
+  free (X);
+  free (Y);
+
+  test_nested ();
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-21.c b/libgomp/testsuite/libgomp.c/target-21.c
new file mode 100644
index 00000000000..41498cf2148
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-21.c
@@ -0,0 +1,79 @@
+extern void abort (void);
+union U { int x; long long y; };
+struct T { int a; union U b; int c; };
+struct S { int s; int u; struct T v; int x[10]; union U w; int y[10]; int z[10]; };
+volatile int z;
+
+int
+main ()
+{ /* Check mapping of individual struct members and member array sections on target.  */
+  struct S s;
+  s.s = 0;
+  s.u = 1;
+  s.v.a = 2;
+  s.v.b.y = 3LL;
+  s.v.c = 19;
+  s.w.x = 4;
+  s.x[0] = 7;
+  s.x[1] = 8;
+  s.y[3] = 9;
+  s.y[4] = 10;
+  s.y[5] = 11;
+  int err = 0;
+  #pragma omp target map (to:s.v.b, s.u, s.x[0:z + 2]) \
+		     map (tofrom:s.y[3:3]) \
+		     map (from: s.w, s.z[z + 1:z + 3], err)
+  { /* The volatile global z is read in the section bounds, so they are not constants.  */
+    err = 0;
+    if (s.u != 1 || s.v.b.y != 3LL || s.x[0] != 7 || s.x[1] != 8
+	|| s.y[3] != 9 || s.y[4] != 10 || s.y[5] != 11)
+      err = 1;
+    s.w.x = 6;
+    s.y[3] = 12;
+    s.y[4] = 13;
+    s.y[5] = 14;
+    s.z[1] = 15;
+    s.z[2] = 16;
+    s.z[3] = 17;
+  }
+  if (err || s.w.x != 6 || s.y[3] != 12 || s.y[4] != 13 || s.y[5] != 14
+      || s.z[1] != 15 || s.z[2] != 16 || s.z[3] != 17)
+    abort ();
+  s.u++; /* Change the host values before the second round.  */
+  s.v.a++;
+  s.v.b.y++;
+  s.w.x++;
+  s.x[1] = 18;
+  s.z[0] = 19;
+  #pragma omp target data map (tofrom: s)
+  #pragma omp target map (always to: s.w, s.x[1], err) map (alloc:s.u, s.v.b, s.z[z:z + 1])
+  { /* Here the whole of s is already mapped by the enclosing data region.  */
+    err = 0;
+    if (s.u != 2 || s.v.b.y != 4LL || s.w.x != 7 || s.x[1] != 18 || s.z[0] != 19)
+      err = 1;
+    s.w.x = 8;
+    s.x[1] = 20;
+    s.z[0] = 21;
+  }
+  if (err || s.w.x != 8 || s.x[1] != 20 || s.z[0] != 21)
+    abort ();
+  s.u++; /* Change the host values before the third round.  */
+  s.v.a++;
+  s.v.b.y++;
+  s.w.x++;
+  s.x[0] = 22;
+  s.x[1] = 23;
+  #pragma omp target data map (from: s.w, s.x[0:2]) map (to: s.v.b, s.u)
+  #pragma omp target map (always to: s.w, s.x[0:2], err) map (alloc:s.u, s.v.b)
+  { /* Here only selected members are mapped by the enclosing data region.  */
+    err = 0;
+    if (s.u != 3 || s.v.b.y != 5LL || s.w.x != 9 || s.x[0] != 22 || s.x[1] != 23)
+      err = 1;
+    s.w.x = 11;
+    s.x[0] = 24;
+    s.x[1] = 25;
+  }
+  if (err || s.w.x != 11 || s.x[0] != 24 || s.x[1] != 25)
+    abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-22.c b/libgomp/testsuite/libgomp.c/target-22.c
new file mode 100644
index 00000000000..aad8a0a09df
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-22.c
@@ -0,0 +1,51 @@
+extern void abort (void);
+struct T { int a; int *b; int c; };
+struct S { int *s; char *u; struct T v; short *w; };
+volatile int z;
+
+int
+main ()
+{ /* Check mapping of array sections reached through pointer members of a struct.  */
+  struct S s;
+  int a[32], i;
+  char b[32];
+  short c[32];
+  for (i = 0; i < 32; i++)
+    {
+      a[i] = i;
+      b[i] = 32 + i;
+      c[i] = 64 + i;
+    }
+  s.s = a;
+  s.u = b + 2;
+  s.v.b = a + 16;
+  s.w = c + 3;
+  int err = 0;
+  #pragma omp target map (to:s.v.b[0:z + 7], s.u[z + 1:z + 4]) \
+		     map (tofrom:s.s[3:3]) \
+		     map (from: s.w[z:4], err) private (i)
+  { /* The volatile global z is read in the section bounds, so they are not constants.  */
+    err = 0;
+    for (i = 0; i < 7; i++)
+      if (s.v.b[i] != 16 + i) /* s.v.b[i] is a[16 + i].  */
+	err = 1;
+    for (i = 1; i < 5; i++)
+      if (s.u[i] != 34 + i) /* s.u[i] is b[2 + i], i.e. 32 + 2 + i.  */
+	err = 1;
+    for (i = 3; i < 6; i++)
+      if (s.s[i] != i)
+	err = 1;
+      else
+	s.s[i] = 128 + i;
+    for (i = 0; i < 4; i++)
+      s.w[i] = 96 + i; /* s.w[i] is c[3 + i].  */
+  }
+  if (err)
+    abort ();
+  for (i = 0; i < 32; i++) /* c[3..6] hold 96..99, i.e. 93 + i; b was mapped "to" only.  */
+    if (a[i] != ((i >= 3 && i < 6) ? 128 + i : i)
+	|| b[i] != 32 + i
+	|| c[i] != ((i >= 3 && i < 7) ? 93 + i : 64 + i))
+      abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-23.c b/libgomp/testsuite/libgomp.c/target-23.c
new file mode 100644
index 00000000000..fb1532a07b2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-23.c
@@ -0,0 +1,48 @@
+extern void abort (void);
+struct S { int s; int *u; int v[5]; };
+volatile int z;
+
+int
+main ()
+{ /* Check enter/exit data and target update on struct members and a pointer-member section.  */
+  int u[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, err = 0;
+  struct S s = { 9, u + 3, { 10, 11, 12, 13, 14 } };
+  int *v = u + 4; /* v[k] is s.u[k + 1], i.e. u[k + 4].  */
+  #pragma omp target enter data map (to: s.s, s.u[0:5]) map (alloc: s.v[1:3])
+  s.s++;
+  u[3]++;
+  s.v[1]++;
+  #pragma omp target update to (s.s) to (s.u[0:2], s.v[1:3])
+  #pragma omp target map (alloc: s.s, s.v[1:3]) map (from: err)
+  {
+    err = 0;
+    if (s.s != 10 || s.v[1] != 12 || s.v[2] != 12 || s.v[3] != 13)
+      err = 1;
+    if (v[-1] != 4 || v[0] != 4 || v[1] != 5 || v[2] != 6 || v[3] != 7)
+      err = 1;
+    s.s++;
+    s.v[2] += 2;
+    v[-1] = 5; /* Writes u[3].  */
+    v[3] = 9; /* Writes u[7].  */
+  }
+  if (err)
+    abort ();
+  #pragma omp target map (alloc: s.u[0:5])
+  {
+    err = 0;
+    if (s.u[0] != 5 || s.u[1] != 4 || s.u[2] != 5 || s.u[3] != 6 || s.u[4] != 9)
+      err = 1;
+    s.u[1] = 12; /* Writes u[4].  */
+  }
+  #pragma omp target update from (s.s, s.u[0:5]) from (s.v[1:3])
+  if (err || s.s != 11 || u[0] != 0 || u[1] != 1 || u[2] != 2 || u[3] != 5
+      || u[4] != 12 || u[5] != 5 || u[6] != 6 || u[7] != 9 || u[8] != 8
+      || u[9] != 9 || s.v[0] != 10 || s.v[1] != 12 || s.v[2] != 14
+      || s.v[3] != 13 || s.v[4] != 14)
+    abort ();
+  #pragma omp target exit data map (release: s.s)
+  #pragma omp target exit data map (release: s.u[0:5])
+  #pragma omp target exit data map (delete: s.v[1:3])
+  #pragma omp target exit data map (release: s.s)
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-24.c b/libgomp/testsuite/libgomp.c/target-24.c
new file mode 100644
index 00000000000..e0ff29aaee8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-24.c
@@ -0,0 +1,43 @@
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;  /* default device number out of range: query the initial device */
+
+  int a[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+  int *b = a;  /* b points at a[0] */
+  int shared_mem = 0;
+  #pragma omp target map (alloc: shared_mem)
+  shared_mem = 1;  /* presumably seen by the host only with shared memory -- confirm */
+  if (omp_target_is_present (b, d) != shared_mem)
+    abort ();
+  #pragma omp target enter data map (to: a)
+  if (omp_target_is_present (b, d) == 0)
+    abort ();
+  #pragma omp target enter data map (alloc: b[:0])
+  if (omp_target_is_present (b, d) == 0)
+    abort ();
+  #pragma omp target exit data map (release: b[:0])
+  if (omp_target_is_present (b, d) == 0)  /* one release: still expected present */
+    abort ();
+  #pragma omp target exit data map (release: b[:0])
+  if (omp_target_is_present (b, d) != shared_mem)  /* second release: back to the initial state */
+    abort ();
+  #pragma omp target enter data map (to: a)
+  if (omp_target_is_present (b, d) == 0)
+    abort ();
+  #pragma omp target enter data map (always, to: b[:0])
+  if (omp_target_is_present (b, d) == 0)
+    abort ();
+  #pragma omp target exit data map (delete: b[:0])
+  if (omp_target_is_present (b, d) != shared_mem)  /* a single delete after two enters */
+    abort ();
+  #pragma omp target exit data map (from: b[:0])
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-25.c b/libgomp/testsuite/libgomp.c/target-25.c
new file mode 100644
index 00000000000..aeb19aee510
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-25.c
@@ -0,0 +1,84 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int
+main ()
+{
+  int x = 0, y = 0, z = 0, s = 11, t = 12, u = 13, w = 7, err;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task depend(in: x)
+    {
+      usleep (5000);
+      x = 1;  /* value the first target region expects */
+    }
+    #pragma omp task depend(in: x)
+    {
+      usleep (6000);
+      y = 2;
+    }
+    #pragma omp task depend(out: z)
+    {
+      usleep (7000);
+      z = 3;
+    }
+    #pragma omp target map(tofrom: x) map(from: err) firstprivate (y) depend(inout: x, z)
+    err = (x != 1 || y != 2 || z != 3);  /* all three preceding tasks must have stored */
+    if (err)
+      abort ();
+    #pragma omp task depend(in: x)
+    {
+      usleep (5000);
+      x = 4;
+    }
+    #pragma omp task depend(in: x)
+    {
+      usleep (4000);
+      y = 5;
+    }
+    #pragma omp task depend(in: z)
+    {
+      usleep (3000);
+      z = 6;
+    }
+    #pragma omp target enter data nowait map (to: w)
+    #pragma omp target enter data depend (inout: x, z) map (to: x, y, z)
+    #pragma omp target map (alloc: x, y, z) map(from: err)
+    {
+      err = (x != 4 || y != 5 || z != 6);
+      x = 7;  /* 7, 8, 9 are the values checked at the end of the single region */
+      y = 8;
+      z = 9;
+    }
+    if (err)
+      abort ();
+    #pragma omp taskwait
+    #pragma omp target map (alloc: w) map(from: err)
+    {
+      err = w != 7;  /* 7 is w's initial host value */
+      w = 17;
+    }
+    if (err)
+      abort (); 
+    #pragma omp task depend(in: x)
+    {
+      usleep (2000);
+      s = 14;
+    }
+    #pragma omp task depend(in: x)
+    {
+      usleep (3000);
+      t = 15;
+    }
+    #pragma omp task depend(in: z)
+    {
+      usleep (4000);
+      u = 16;
+    }
+    #pragma omp target exit data depend (inout: x, z) map (from: x, y, z, w)
+    if (x != 7 || y != 8 || z != 9 || s != 14 || t != 15 || u != 16 || w != 17)
+      abort ();
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-26.c b/libgomp/testsuite/libgomp.c/target-26.c
new file mode 100644
index 00000000000..fa6b52598da
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-26.c
@@ -0,0 +1,36 @@
+extern void abort (void);
+#pragma omp declare target
+int a[4] = { 2, 3, 4, 5 }, *b;  /* both declared between declare target / end declare target */
+#pragma omp end declare target
+
+int
+main ()
+{
+  int err;
+  int c[3] = { 6, 7, 8 };
+  b = c;  /* b[0], b[1] are c[0], c[1], i.e. 6 and 7 */
+  #pragma omp target map(to: a[0:2], b[0:2]) map(from: err)
+  err = a[0] != 2 || a[1] != 3 || a[2] != 4 || a[3] != 5 || b[0] != 6 || b[1] != 7;
+  if (err)
+    abort ();
+  a[1] = 9;  /* host-side changes to the elements named in map(always,to:a[1:2]) */
+  a[2] = 10;
+  #pragma omp target map(always,to:a[1:2]) map(from: err)
+  err = a[0] != 2 || a[1] != 9 || a[2] != 10 || a[3] != 5;
+  if (err)
+    abort ();
+  #pragma omp parallel firstprivate(a, b, c, err) num_threads (2)
+  #pragma omp single
+  {
+    b = c + 1;  /* b[0], b[1] are now c[1], c[2] */
+    a[0] = 11;
+    a[2] = 13;
+    c[1] = 14;  /* read back as b[0] in the target region */
+    int d = 0;  /* non-constant lower bound for the b[d:2] section */
+    #pragma omp target map(to: a[0:3], b[d:2]) map (from: err)
+    err = a[0] != 11 || a[1] != 9 || a[2] != 13 || b[0] != 14 || b[1] != 8;
+    if (err)
+      abort ();
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-27.c b/libgomp/testsuite/libgomp.c/target-27.c
new file mode 100644
index 00000000000..c86651b02e3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-27.c
@@ -0,0 +1,67 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int
+main ()
+{
+  int x = 0, y = 0, z = 0, err;
+  int shared_mem = 0;
+  #pragma omp target map(to: shared_mem)
+  shared_mem = 1;  /* presumably seen by the host only with shared memory -- confirm */
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task depend(in: x)
+    {
+      usleep (5000);
+      x = 1;
+    }
+    #pragma omp task depend(in: x)
+    {
+      usleep (6000);
+      y = 2;
+    }
+    #pragma omp task depend(out: z)
+    {
+      usleep (7000);
+      z = 3;
+    }
+    #pragma omp target enter data map(to: x, y, z) depend(inout: x, z) nowait
+    #pragma omp task depend(inout: x, z)
+    {
+      x++; y++; z++;  /* 1, 2, 3 -> 2, 3, 4: the values the target region checks */
+    }
+    #pragma omp target update to(x, y) depend(inout: x) nowait
+    #pragma omp target enter data map(always, to: z) depend(inout: z) nowait
+    #pragma omp target map (alloc: x, y, z) map (from: err) depend(inout: x, z)
+    {
+      err = x != 2 || y != 3 || z != 4;
+      x = 5; y = 6; z = 7;  /* values the final host check expects */
+    }
+    #pragma omp task depend(in: x)
+    {
+      usleep (5000);
+      if (!shared_mem)  /* host store is skipped when shared_mem is nonzero */
+	x = 1;
+    }
+    #pragma omp task depend(in: x)
+    {
+      usleep (6000);
+      if (!shared_mem)
+	y = 2;
+    }
+    #pragma omp task depend(out: z)
+    {
+      usleep (3000);
+      if (!shared_mem)
+	z = 3;
+    }
+    #pragma omp target exit data map(release: z) depend(inout: z) nowait
+    #pragma omp target exit data map(from: x, y) depend(inout: x) nowait
+    #pragma omp target exit data map(from: z) depend(inout: z) nowait
+    #pragma omp taskwait
+    if (err || x != 5 || y != 6 || z != 7)
+      abort ();
+  }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/target-7.c b/libgomp/testsuite/libgomp.c/target-7.c
index 0fe6150283d..41a1332bb0c 100644
--- a/libgomp/testsuite/libgomp.c/target-7.c
+++ b/libgomp/testsuite/libgomp.c/target-7.c
@@ -37,63 +37,63 @@ foo (int f)
     abort ();
   #pragma omp target data device (d) map (to: h)
   {
-    #pragma omp target device (d)
+    #pragma omp target device (d) map (h)
     if (omp_get_level () != 0 || (f && !omp_is_initial_device ()) || h++ != 5)
       abort ();
     #pragma omp target update device (d) from (h)
   }
   #pragma omp target data if (v > 1) map (to: h)
   {
-    #pragma omp target if (v > 1)
+    #pragma omp target if (v > 1) map(h)
     if (omp_get_level () != 0 || !omp_is_initial_device () || h++ != 6)
       abort ();
     #pragma omp target update if (v > 1) from (h)
   }
   #pragma omp target data device (d) if (v > 1) map (to: h)
   {
-    #pragma omp target device (d) if (v > 1)
+    #pragma omp target device (d) if (v > 1) map(h)
     if (omp_get_level () != 0 || !omp_is_initial_device () || h++ != 7)
       abort ();
     #pragma omp target update device (d) if (v > 1) from (h)
   }
   #pragma omp target data if (v <= 1) map (to: h)
   {
-    #pragma omp target if (v <= 1)
+    #pragma omp target if (v <= 1) map (tofrom: h)
     if (omp_get_level () != 0 || h++ != 8)
       abort ();
     #pragma omp target update if (v <= 1) from (h)
   }
   #pragma omp target data device (d) if (v <= 1) map (to: h)
   {
-    #pragma omp target device (d) if (v <= 1)
+    #pragma omp target device (d) if (v <= 1) map (h)
     if (omp_get_level () != 0 || (f && !omp_is_initial_device ()) || h++ != 9)
       abort ();
     #pragma omp target update device (d) if (v <= 1) from (h)
   }
   #pragma omp target data if (0) map (to: h)
   {
-    #pragma omp target if (0)
+    #pragma omp target if (0) map (h)
     if (omp_get_level () != 0 || !omp_is_initial_device () || h++ != 10)
       abort ();
     #pragma omp target update if (0) from (h)
   }
   #pragma omp target data device (d) if (0) map (to: h)
   {
-    #pragma omp target device (d) if (0)
+    #pragma omp target device (d) if (0) map (h)
     if (omp_get_level () != 0 || !omp_is_initial_device () || h++ != 11)
       abort ();
     #pragma omp target update device (d) if (0) from (h)
   }
   #pragma omp target data if (1) map (to: h)
   {
-    #pragma omp target if (1)
+    #pragma omp target if (1) map (tofrom: h)
     if (omp_get_level () != 0 || h++ != 12)
       abort ();
     #pragma omp target update if (1) from (h)
   }
   #pragma omp target data device (d) if (1) map (to: h)
   {
-    #pragma omp target device (d) if (1)
+    #pragma omp target device (d) if (1) map (tofrom: h)
     if (omp_get_level () != 0 || (f && !omp_is_initial_device ()) || h++ != 13)
       abort ();
     #pragma omp target update device (d) if (1) from (h)
diff --git a/libgomp/testsuite/libgomp.c/taskloop-1.c b/libgomp/testsuite/libgomp.c/taskloop-1.c
new file mode 100644
index 00000000000..21551f2950c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/taskloop-1.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp -std=c99" } */
+
+int q, r, e;
+
+__attribute__((noinline, noclone)) void
+foo (long a, long b)  /* taskloop over d = a, a + 2, ... < b; stores d to global q */
+{
+  #pragma omp taskloop lastprivate (q) nogroup
+    for (long d = a; d < b; d += 2)
+      {
+	q = d;
+	if (d < 2 || d > 6 || (d & 1))  /* d outside 2..6 or odd: flag an error */
+	  #pragma omp atomic
+	    e |= 1;
+      }
+}
+
+__attribute__((noinline, noclone)) int
+bar (int a, int b)  /* taskloop over d in [a, b); returns the local q afterwards */
+{
+  int q = 7;  /* local q, distinct from the file-scope q */
+  #pragma omp taskloop lastprivate (q)
+    for (int d = a; d < b; d++)
+      {
+	if (d < 12 || d > 17)  /* d outside 12..17: flag an error */
+	  #pragma omp atomic
+	    e |= 1;
+	q = d;
+      }
+  return q;
+}
+
+int
+main ()
+{
+  #pragma omp parallel
+    #pragma omp single
+      {
+	foo (2, 7);  /* d takes 2, 4, 6 */
+	r = bar (12, 18);  /* d takes 12 .. 17 */
+      }
+  if (q != 6 || r != 17 || e)  /* last d of each loop, and no error flag */
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/taskloop-2.c b/libgomp/testsuite/libgomp.c/taskloop-2.c
new file mode 100644
index 00000000000..be893ebf80a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/taskloop-2.c
@@ -0,0 +1,147 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -std=c99" } */
+/* { dg-additional-options "-msse2" { target sse2_runtime } } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+int u[1024], v[1024], w[1024], m;
+
+__attribute__((noinline, noclone)) void
+f1 (long a, long b)  /* u[d] = v[d] + w[d] for d in [a, b) */
+{
+  #pragma omp taskloop simd default(none) shared(u, v, w) nogroup
+  for (long d = a; d < b; d++)
+    u[d] = v[d] + w[d];
+}
+
+__attribute__((noinline, noclone)) int
+f2 (long a, long b, long c)  /* sums arrays over [a, b); returns d + c + e */
+{
+  int d, e;
+  #pragma omp taskloop simd default(none) shared(u, v, w) linear(d:1) linear(c:5) lastprivate(e)
+  for (d = a; d < b; d++)
+    {
+      u[d] = v[d] + w[d];
+      c = c + 5;  /* same step as the linear(c:5) clause */
+      e = c + 9;
+    }
+  return d + c + e;
+}
+
+__attribute__((noinline, noclone)) int
+f3 (long a, long b)  /* sums arrays over [a, b); returns d after the loop */
+{
+  int d;
+  #pragma omp taskloop simd default(none) shared(u, v, w)
+  for (d = a; d < b; d++)
+    {
+      int *p = &d;  /* takes the iteration variable's address; p is otherwise unused */
+      u[d] = v[d] + w[d];
+    }
+  return d;
+}
+
+__attribute__((noinline, noclone)) int
+f4 (long a, long b, long c, long d)  /* collapsed 2-D loop; returns e + f + g */
+{
+  int e, f, g;
+  #pragma omp taskloop simd default(none) shared(u, v, w) collapse(2) lastprivate(g)
+  for (e = a; e < b; e++)
+    for (f = c; f < d; f++)
+      {
+	int *p = &e;  /* addresses of both iteration variables are taken; */
+	int *q = &f;  /* p and q are otherwise unused */
+	int r = 32 * e + f;  /* flat array index for the (e, f) pair */
+	u[r] = v[r] + w[r];
+	g = r;
+      }
+  return e + f + g;
+}
+
+__attribute__((noinline, noclone)) int
+f5 (long a, long b, long c, long d)  /* collapsed 2-D loop; returns e + f */
+{
+  int e, f;
+  #pragma omp taskloop simd default(none) shared(u, v, w) collapse(2)
+  for (e = a; e < b; e++)
+    for (f = c; f < d; f++)
+      {
+	int r = 32 * e + f;  /* flat array index for the (e, f) pair */
+	u[r] = v[r] + w[r];
+      }
+  return e + f;
+}
+
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 1024; i++)
+    {
+      v[i] = i;  /* first input pattern: v[i] + w[i] == 2 * i + 1 */
+      w[i] = i + 1;
+    }
+  #pragma omp parallel
+    #pragma omp single
+      f1 (0, 1024);
+  for (i = 0; i < 1024; i++)
+    if (u[i] != 2 * i + 1)
+      __builtin_abort ();
+    else
+      {
+	v[i] = 1024 - i;  /* second input pattern: v[i] + w[i] == 1536 - 2 * i */
+	w[i] = 512 - i;
+      }
+  #pragma omp parallel
+    #pragma omp single
+      m = f2 (2, 1022, 17);
+  for (i = 0; i < 1024; i++)
+    if ((i < 2 || i >= 1022) ? u[i] != 2 * i + 1 : u[i] != 1536 - 2 * i)  /* f2 only wrote [2, 1022) */
+      __builtin_abort ();
+    else
+      {
+	v[i] = i;
+	w[i] = i + 1;
+      }
+  if (m != 1022 + 2 * (1020 * 5 + 17) + 9)  /* d + c + e: d = 1022, c = 17 + 5 * 1020, e = c + 9 */
+    __builtin_abort ();
+  #pragma omp parallel
+    #pragma omp single
+      m = f3 (0, 1024);
+  for (i = 0; i < 1024; i++)
+    if (u[i] != 2 * i + 1)
+      __builtin_abort ();
+    else
+      {
+	v[i] = 1024 - i;
+	w[i] = 512 - i;
+      }
+  if (m != 1024)  /* f3 returns its iteration variable after the loop */
+    __builtin_abort ();
+  #pragma omp parallel
+    #pragma omp single
+      m = f4 (0, 32, 0, 32);
+  for (i = 0; i < 1024; i++)
+    if (u[i] != 1536 - 2 * i)
+      __builtin_abort ();
+    else
+      {
+	v[i] = i;
+	w[i] = i + 1;
+      }
+  if (m != 32 + 32 + 1023)  /* e + f + g, where g is the last flat index 32 * 31 + 31 */
+    __builtin_abort ();
+  #pragma omp parallel
+    #pragma omp single
+      m = f5 (0, 32, 0, 32);
+  for (i = 0; i < 1024; i++)
+    if (u[i] != 2 * i + 1)
+      __builtin_abort ();
+    else
+      {
+	v[i] = 1024 - i;
+	w[i] = 512 - i;
+      }
+  if (m != 32 + 32)  /* e + f after both loops finish */
+    __builtin_abort ();
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/taskloop-3.c b/libgomp/testsuite/libgomp.c/taskloop-3.c
new file mode 100644
index 00000000000..5356d7f0251
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/taskloop-3.c
@@ -0,0 +1,84 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp -std=c99" } */
+
+int g;
+int a[1024];
+
+__attribute__((noinline, noclone)) int
+f1 (int x)  /* x is both firstprivate and lastprivate on the taskloop */
+{
+  #pragma omp taskloop firstprivate (x) lastprivate (x)
+  for (int i = 0; i < 64; i++)
+    {
+      if (x != 74)  /* 74 is the argument main passes in */
+	__builtin_abort ();
+      if (i == 63)  /* only the final iteration stores to x */
+	x = i + 4;
+    }
+  return x;
+}
+
+__attribute__((noinline, noclone)) void
+f2 (void)  /* same pattern as f1, on the global g and with nogroup */
+{
+  #pragma omp taskloop firstprivate (g) lastprivate (g) nogroup
+  for (int i = 0; i < 64; i++)
+    {
+      if (g != 77)  /* 77 is what main assigns to g before the call */
+	__builtin_abort ();
+      if (i == 63)  /* only the final iteration stores to g */
+	g = i + 9;
+    }
+}
+
+__attribute__((noinline, noclone)) long long
+f3 (long long a, long long b, long long c)  /* loop from a to b in steps of c */
+{
+  long long i;
+  int l;
+  #pragma omp taskloop default (none) lastprivate (i, l)
+  for (i = a; i < b; i += c)
+    l = i;
+  return l * 7 + i;  /* main expects l = last i executed, i = value after the loop */
+}
+
+__attribute__((noinline, noclone)) long long
+f4 (long long a, long long b, long long c, long long d,
+    long long e, long long f, int k)  /* i: a..b step e; j: c..d step f */
+{
+  long long i, j;
+  int l;
+  #pragma omp taskloop default (none) collapse(2) \
+	      firstprivate (k) lastprivate (i, j, k, l)
+  for (i = a; i < b; i += e)
+    for (j = c; j < d; j += f)
+      {
+	if (k != 73)  /* 73 is the k main passes in */
+	  __builtin_abort ();
+	if (i == 31 && j == 46)  /* the last (i, j) pair for main's arguments */
+	  k = i;
+	l = j;
+      }
+  return i + 5 * j + 11 * k + 17 * l;
+}
+
+int
+main ()
+{
+  #pragma omp parallel
+    #pragma omp single
+      {
+	if (f1 (74) != 63 + 4)  /* x as stored by iteration i == 63 */
+	  __builtin_abort ();
+	g = 77;
+	f2 ();
+	#pragma omp taskwait
+	if (g != 63 + 9)  /* g as stored by iteration i == 63 */
+	  __builtin_abort ();
+	if (f3 (7, 12, 2) != 11 * 7 + 13)  /* i runs 7, 9, 11 and ends at 13 */
+	  __builtin_abort ();
+	if (f4 (0, 32, 16, 48, 1, 2, 73) != 32 + 5 * 48 + 11 * 31 + 17 * 46)  /* i, j, k, l = 32, 48, 31, 46 */
+	  __builtin_abort ();
+      }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c/taskloop-4.c b/libgomp/testsuite/libgomp.c/taskloop-4.c
new file mode 100644
index 00000000000..a69be19c9c2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/taskloop-4.c
@@ -0,0 +1,97 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fopenmp" } */
+
+int u[64], v;
+
+__attribute__((noinline, noclone)) int
+test (int a, int b, int c, int d, void (*fn) (int, int, int, int),
+      int *num_tasks, int *min_iters, int *max_iters)  /* returns the sum of u[0 .. v-1] */
+{
+  int i, t = 0;
+  __builtin_memset (u, 0, sizeof u);  /* reset the counters that fn fills in */
+  v = 0;
+  fn (a, b, c, d);
+  *min_iters = 0;
+  *max_iters = 0;
+  *num_tasks = v;  /* v is how many slots of u fn claimed */
+  if (v)
+    {
+      *min_iters = u[0];
+      *max_iters = u[0];
+      t = u[0];
+      for (i = 1; i < v; i++)  /* fold the remaining slots into min, max and sum */
+	{
+	  if (*min_iters > u[i])
+	    *min_iters = u[i];
+	  if (*max_iters < u[i])
+	    *max_iters = u[i];
+	  t += u[i];
+	}
+    }
+  return t;
+}
+
+void
+grainsize (int a, int b, int c, int d)  /* loop a..b step c under taskloop grainsize(d) */
+{
+  int i, j = 0, k = 0;
+  #pragma omp taskloop firstprivate (j, k) grainsize(d)
+  for (i = a; i < b; i += c)
+    {
+      if (j == 0)  /* j is still its firstprivate 0: claim a new slot in u */
+	{
+	  #pragma omp atomic capture
+	    k = v++;
+	  if (k >= 64)  /* u has only 64 slots */
+	    __builtin_abort ();
+	}
+      u[k] = ++j;  /* running iteration count for slot k */
+    }
+}
+
+void
+num_tasks (int a, int b, int c, int d)  /* loop a..b step c under taskloop num_tasks(d) */
+{
+  int i, j = 0, k = 0;
+  #pragma omp taskloop firstprivate (j, k) num_tasks(d)
+  for (i = a; i < b; i += c)
+    {
+      if (j == 0)  /* j is still its firstprivate 0: claim a new slot in u */
+	{
+	  #pragma omp atomic capture
+	    k = v++;
+	  if (k >= 64)  /* u has only 64 slots */
+	    __builtin_abort ();
+	}
+      u[k] = ++j;  /* running iteration count for slot k */
+    }
+}
+
+int
+main ()
+{
+  #pragma omp parallel
+    #pragma omp single
+      {
+	int min_iters, max_iters, ntasks;
+	/* If grainsize is present, each task runs >= grainsize && < 2 * grainsize
+	   loop iterations, unless # of loop iterations is smaller than grainsize.  */
+	if (test (0, 79, 1, 17, grainsize, &ntasks, &min_iters, &max_iters) != 79
+	    || min_iters < 17 || max_iters >= 17 * 2)
+	  __builtin_abort ();
+	if (test (-49, 2541, 7, 28, grainsize, &ntasks, &min_iters, &max_iters) != 370
+	    || min_iters < 28 || max_iters >= 28 * 2)
+	  __builtin_abort ();
+	if (test (7, 21, 2, 15, grainsize, &ntasks, &min_iters, &max_iters) != 7
+	    || ntasks != 1 || min_iters != 7 || max_iters != 7)
+	  __builtin_abort ();
+	/* If num_tasks is present, # of tasks created is min (# of loop iters, num_tasks).  */
+	if (test (-51, 2500, 48, 9, num_tasks, &ntasks, &min_iters, &max_iters) != 54
+	    || ntasks != 9)
+	  __builtin_abort ();
+	if (test (0, 25, 2, 17, num_tasks, &ntasks, &min_iters, &max_iters) != 13
+	    || ntasks != 13)
+	  __builtin_abort ();
+      }
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.fortran/affinity1.f90 b/libgomp/testsuite/libgomp.fortran/affinity1.f90
new file mode 100644
index 00000000000..26b5185ba3c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/affinity1.f90
@@ -0,0 +1,49 @@
+! { dg-do run }
+! { dg-skip-if "" { ! run_expensive_tests } { "*" } { "-O2" } }
+! { dg-set-target-env-var OMP_PROC_BIND "spread,close" }
+! { dg-set-target-env-var OMP_PLACES "{6,7}:4:-2,!{2,3}" }
+! { dg-set-target-env-var OMP_NUM_THREADS "2" }
+
+  use omp_lib
+  integer :: num, i, nump
+  num = omp_get_num_places ()
+  print *, 'omp_get_num_places () == ', num
+  do i = 0, num - 1  ! place numbers are queried from 0 to num - 1
+    nump = omp_get_place_num_procs (place_num = i)
+    if (nump .eq. 0) then  ! nothing to list: print empty braces
+      print *, 'place ', i, ' {}'
+    else
+      call print_place (i, nump)
+    end if
+  end do
+  call print_place_var  ! report from outside any parallel region first
+  call omp_set_nested (nested = .true.)
+  !$omp parallel
+    if (omp_get_thread_num () == omp_get_num_threads () - 1) then  ! highest-numbered thread only
+      !$omp parallel
+        if (omp_get_thread_num () == omp_get_num_threads () - 1) &
+          call print_place_var
+      !$omp end parallel
+    end if
+  !$omp end parallel
+contains
+  subroutine print_place (i, nump)  ! prints the ids returned for place i
+    integer, intent (in) :: i, nump
+    integer :: ids(nump)  ! automatic array sized by the caller-supplied count
+    call omp_get_place_proc_ids (place_num = i, ids = ids)
+    print *, 'place ', i, ' {', ids, '}'
+  end subroutine
+  subroutine print_place_var  ! prints the current place number and, if any, its partition
+    integer :: place, num_places
+    place = omp_get_place_num ()
+    num_places = omp_get_partition_num_places ()
+    print *, 'place ', place
+    if (num_places .gt. 0) call print_partition (num_places)  ! avoids a zero-sized place_nums array
+  end subroutine
+  subroutine print_partition (num_places)  ! prints the first and last partition entries
+    integer, intent (in) :: num_places
+    integer :: place_nums(num_places)  ! automatic array sized by the caller-supplied count
+    call omp_get_partition_place_nums (place_nums = place_nums)
+    print *, 'partition ', place_nums(1), '-', place_nums(num_places)
+  end subroutine
+end
diff --git a/libgomp/testsuite/libgomp.fortran/affinity2.f90 b/libgomp/testsuite/libgomp.fortran/affinity2.f90
new file mode 100644
index 00000000000..338f0e8bb93
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/affinity2.f90
@@ -0,0 +1,8 @@
+! { dg-do run }
+! { dg-additional-options "-fdefault-integer-8" }
+! { dg-skip-if "" { ! run_expensive_tests } { "*" } { "-O2" } }
+! { dg-set-target-env-var OMP_PROC_BIND "spread,close" }
+! { dg-set-target-env-var OMP_PLACES "{6,7}:4:-2,!{2,3}" }
+! { dg-set-target-env-var OMP_NUM_THREADS "2" }
+
+include 'affinity1.f90'