Diffstat (limited to 'src')
-rw-r--r-- | src/arena.c | 970
-rw-r--r-- | src/bitmap.c | 2
-rw-r--r-- | src/chunk.c | 8
-rw-r--r-- | src/chunk_dss.c | 15
-rw-r--r-- | src/chunk_mmap.c | 4
-rw-r--r-- | src/ckh.c | 12
-rw-r--r-- | src/ctl.c | 247
-rw-r--r-- | src/huge.c | 46
-rw-r--r-- | src/jemalloc.c | 1157
-rw-r--r-- | src/mutex.c | 2
-rw-r--r-- | src/prof.c | 640
-rw-r--r-- | src/quarantine.c | 13
-rw-r--r-- | src/rtree.c | 78
-rw-r--r-- | src/stats.c | 8
-rw-r--r-- | src/tcache.c | 9
-rw-r--r-- | src/tsd.c | 36
-rw-r--r-- | src/util.c | 85
-rw-r--r-- | src/zone.c | 2
18 files changed, 2007 insertions(+), 1327 deletions(-)
diff --git a/src/arena.c b/src/arena.c index d28b629a..4da6d50c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -38,52 +38,18 @@ const uint8_t small_size2bin[] = { }; /******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static void arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, - size_t pageind, size_t npages, bool maybe_adjac_pred, - bool maybe_adjac_succ); -static void arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, - size_t pageind, size_t npages, bool maybe_adjac_pred, - bool maybe_adjac_succ); -static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size, - bool large, size_t binind, bool zero); -static arena_chunk_t *arena_chunk_alloc(arena_t *arena); -static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk); -static arena_run_t *arena_run_alloc_helper(arena_t *arena, size_t size, - bool large, size_t binind, bool zero); -static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool large, - size_t binind, bool zero); -static arena_chunk_t *chunks_dirty_iter_cb(arena_chunk_tree_t *tree, - arena_chunk_t *chunk, void *arg); +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + static void arena_purge(arena_t *arena, bool all); static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned); -static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk, - arena_run_t *run, size_t oldsize, size_t newsize); -static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, - arena_run_t *run, size_t oldsize, size_t newsize, bool dirty); -static arena_run_t *arena_bin_runs_first(arena_bin_t *bin); -static void arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run); -static void arena_bin_runs_remove(arena_bin_t *bin, arena_run_t *run); -static arena_run_t *arena_bin_nonfull_run_tryget(arena_bin_t *bin); -static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); -static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); -static void arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run, - arena_bin_t *bin); static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, arena_bin_t *bin); -static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t oldsize, size_t size); -static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, - void *ptr, size_t oldsize, size_t size, size_t extra, bool zero); -static bool arena_ralloc_large(void *ptr, size_t oldsize, size_t size, - size_t extra, bool zero); -static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info, - size_t min_run_size); -static void bin_info_init(void); /******************************************************************************/ @@ -388,50 +354,44 @@ arena_run_page_validate_zeroed(arena_chunk_t *chunk, size_t run_ind) } static void -arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, - size_t binind, bool zero) +arena_cactive_update(arena_t *arena, size_t add_pages, size_t sub_pages) { - arena_chunk_t *chunk; - size_t run_ind, total_pages, need_pages, rem_pages, i; - size_t flag_dirty; - assert((large && binind == BININD_INVALID) || (large == false && binind - != BININD_INVALID)); + if (config_stats) { + ssize_t cactive_diff = 
CHUNK_CEILING((arena->nactive + + add_pages) << LG_PAGE) - CHUNK_CEILING((arena->nactive - + sub_pages) << LG_PAGE); + if (cactive_diff != 0) + stats_cactive_add(cactive_diff); + } +} + +static void +arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind, + size_t flag_dirty, size_t need_pages) +{ + size_t total_pages, rem_pages; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); - flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); total_pages = arena_mapbits_unallocated_size_get(chunk, run_ind) >> LG_PAGE; assert(arena_mapbits_dirty_get(chunk, run_ind+total_pages-1) == flag_dirty); - need_pages = (size >> LG_PAGE); - assert(need_pages > 0); assert(need_pages <= total_pages); rem_pages = total_pages - need_pages; arena_avail_remove(arena, chunk, run_ind, total_pages, true, true); - if (config_stats) { - /* - * Update stats_cactive if nactive is crossing a chunk - * multiple. - */ - size_t cactive_diff = CHUNK_CEILING((arena->nactive + - need_pages) << LG_PAGE) - CHUNK_CEILING(arena->nactive << - LG_PAGE); - if (cactive_diff != 0) - stats_cactive_add(cactive_diff); - } + arena_cactive_update(arena, need_pages, 0); arena->nactive += need_pages; /* Keep track of trailing unused pages for later use. */ if (rem_pages > 0) { if (flag_dirty != 0) { - arena_mapbits_unallocated_set(chunk, run_ind+need_pages, - (rem_pages << LG_PAGE), CHUNK_MAP_DIRTY); + arena_mapbits_unallocated_set(chunk, + run_ind+need_pages, (rem_pages << LG_PAGE), + flag_dirty); arena_mapbits_unallocated_set(chunk, run_ind+total_pages-1, (rem_pages << LG_PAGE), - CHUNK_MAP_DIRTY); + flag_dirty); } else { arena_mapbits_unallocated_set(chunk, run_ind+need_pages, (rem_pages << LG_PAGE), @@ -445,166 +405,217 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large, arena_avail_insert(arena, chunk, run_ind+need_pages, rem_pages, false, true); } +} - /* - * Update the page map separately for large vs. small runs, since it is - * possible to avoid iteration for large mallocs. - */ - if (large) { - if (zero) { - if (flag_dirty == 0) { - /* - * The run is clean, so some pages may be - * zeroed (i.e. never before touched). - */ - for (i = 0; i < need_pages; i++) { - if (arena_mapbits_unzeroed_get(chunk, - run_ind+i) != 0) { - arena_run_zero(chunk, run_ind+i, - 1); - } else if (config_debug) { - arena_run_page_validate_zeroed( - chunk, run_ind+i); - } else { - arena_run_page_mark_zeroed( - chunk, run_ind+i); - } +static void +arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size, + bool remove, bool zero) +{ + arena_chunk_t *chunk; + size_t flag_dirty, run_ind, need_pages, i; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); + flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); + need_pages = (size >> LG_PAGE); + assert(need_pages > 0); + + if (remove) { + arena_run_split_remove(arena, chunk, run_ind, flag_dirty, + need_pages); + } + + if (zero) { + if (flag_dirty == 0) { + /* + * The run is clean, so some pages may be zeroed (i.e. + * never before touched). + */ + for (i = 0; i < need_pages; i++) { + if (arena_mapbits_unzeroed_get(chunk, run_ind+i) + != 0) + arena_run_zero(chunk, run_ind+i, 1); + else if (config_debug) { + arena_run_page_validate_zeroed(chunk, + run_ind+i); + } else { + arena_run_page_mark_zeroed(chunk, + run_ind+i); } - } else { - /* - * The run is dirty, so all pages must be - * zeroed. 
- */ - arena_run_zero(chunk, run_ind, need_pages); } } else { - VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + - (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); + /* The run is dirty, so all pages must be zeroed. */ + arena_run_zero(chunk, run_ind, need_pages); } - - /* - * Set the last element first, in case the run only contains one - * page (i.e. both statements set the same element). - */ - arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0, - flag_dirty); - arena_mapbits_large_set(chunk, run_ind, size, flag_dirty); } else { - assert(zero == false); - /* - * Propagate the dirty and unzeroed flags to the allocated - * small run, so that arena_dalloc_bin_run() has the ability to - * conditionally trim clean pages. - */ - arena_mapbits_small_set(chunk, run_ind, 0, binind, flag_dirty); - /* - * The first page will always be dirtied during small run - * initialization, so a validation failure here would not - * actually cause an observable failure. - */ - if (config_debug && flag_dirty == 0 && - arena_mapbits_unzeroed_get(chunk, run_ind) == 0) - arena_run_page_validate_zeroed(chunk, run_ind); - for (i = 1; i < need_pages - 1; i++) { - arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0); - if (config_debug && flag_dirty == 0 && - arena_mapbits_unzeroed_get(chunk, run_ind+i) == 0) { - arena_run_page_validate_zeroed(chunk, - run_ind+i); - } - } - arena_mapbits_small_set(chunk, run_ind+need_pages-1, - need_pages-1, binind, flag_dirty); - if (config_debug && flag_dirty == 0 && - arena_mapbits_unzeroed_get(chunk, run_ind+need_pages-1) == - 0) { - arena_run_page_validate_zeroed(chunk, - run_ind+need_pages-1); - } VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); } + + /* + * Set the last element first, in case the run only contains one page + * (i.e. both statements set the same element). + */ + arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0, flag_dirty); + arena_mapbits_large_set(chunk, run_ind, size, flag_dirty); +} + +static void +arena_run_split_large(arena_t *arena, arena_run_t *run, size_t size, bool zero) +{ + + arena_run_split_large_helper(arena, run, size, true, zero); +} + +static void +arena_run_init_large(arena_t *arena, arena_run_t *run, size_t size, bool zero) +{ + + arena_run_split_large_helper(arena, run, size, false, zero); +} + +static void +arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size, + size_t binind) +{ + arena_chunk_t *chunk; + size_t flag_dirty, run_ind, need_pages, i; + + assert(binind != BININD_INVALID); + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); + flag_dirty = arena_mapbits_dirty_get(chunk, run_ind); + need_pages = (size >> LG_PAGE); + assert(need_pages > 0); + + arena_run_split_remove(arena, chunk, run_ind, flag_dirty, need_pages); + + /* + * Propagate the dirty and unzeroed flags to the allocated small run, + * so that arena_dalloc_bin_run() has the ability to conditionally trim + * clean pages. + */ + arena_mapbits_small_set(chunk, run_ind, 0, binind, flag_dirty); + /* + * The first page will always be dirtied during small run + * initialization, so a validation failure here would not actually + * cause an observable failure. 
+ */ + if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, + run_ind) == 0) + arena_run_page_validate_zeroed(chunk, run_ind); + for (i = 1; i < need_pages - 1; i++) { + arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0); + if (config_debug && flag_dirty == 0 && + arena_mapbits_unzeroed_get(chunk, run_ind+i) == 0) + arena_run_page_validate_zeroed(chunk, run_ind+i); + } + arena_mapbits_small_set(chunk, run_ind+need_pages-1, need_pages-1, + binind, flag_dirty); + if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk, + run_ind+need_pages-1) == 0) + arena_run_page_validate_zeroed(chunk, run_ind+need_pages-1); + VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + + (run_ind << LG_PAGE)), (need_pages << LG_PAGE)); } static arena_chunk_t * -arena_chunk_alloc(arena_t *arena) +arena_chunk_init_spare(arena_t *arena) { arena_chunk_t *chunk; - size_t i; - if (arena->spare != NULL) { - chunk = arena->spare; - arena->spare = NULL; + assert(arena->spare != NULL); - assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); - assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); - assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == - arena_maxclass); - assert(arena_mapbits_unallocated_size_get(chunk, - chunk_npages-1) == arena_maxclass); - assert(arena_mapbits_dirty_get(chunk, map_bias) == - arena_mapbits_dirty_get(chunk, chunk_npages-1)); - } else { - bool zero; - size_t unzeroed; + chunk = arena->spare; + arena->spare = NULL; - zero = false; - malloc_mutex_unlock(&arena->lock); - chunk = (arena_chunk_t *)chunk_alloc(chunksize, chunksize, - false, &zero, arena->dss_prec); - malloc_mutex_lock(&arena->lock); - if (chunk == NULL) - return (NULL); - if (config_stats) - arena->stats.mapped += chunksize; + assert(arena_mapbits_allocated_get(chunk, map_bias) == 0); + assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0); + assert(arena_mapbits_unallocated_size_get(chunk, map_bias) == + arena_maxclass); + assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) == + arena_maxclass); + assert(arena_mapbits_dirty_get(chunk, map_bias) == + arena_mapbits_dirty_get(chunk, chunk_npages-1)); - chunk->arena = arena; + return (chunk); +} - /* - * Claim that no pages are in use, since the header is merely - * overhead. - */ - chunk->ndirty = 0; +static arena_chunk_t * +arena_chunk_init_hard(arena_t *arena) +{ + arena_chunk_t *chunk; + bool zero; + size_t unzeroed, i; - chunk->nruns_avail = 0; - chunk->nruns_adjac = 0; + assert(arena->spare == NULL); - /* - * Initialize the map to contain one maximal free untouched run. - * Mark the pages as zeroed iff chunk_alloc() returned a zeroed - * chunk. - */ - unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED; - arena_mapbits_unallocated_set(chunk, map_bias, arena_maxclass, - unzeroed); - /* - * There is no need to initialize the internal page map entries - * unless the chunk is not zeroed. 
- */ - if (zero == false) { - VALGRIND_MAKE_MEM_UNDEFINED( - (void *)arena_mapp_get(chunk, map_bias+1), - (size_t)((uintptr_t) arena_mapp_get(chunk, - chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, - map_bias+1))); - for (i = map_bias+1; i < chunk_npages-1; i++) - arena_mapbits_unzeroed_set(chunk, i, unzeroed); - } else { - VALGRIND_MAKE_MEM_DEFINED( - (void *)arena_mapp_get(chunk, map_bias+1), - (size_t)((uintptr_t) arena_mapp_get(chunk, - chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, - map_bias+1))); - if (config_debug) { - for (i = map_bias+1; i < chunk_npages-1; i++) { - assert(arena_mapbits_unzeroed_get(chunk, - i) == unzeroed); - } + zero = false; + malloc_mutex_unlock(&arena->lock); + chunk = (arena_chunk_t *)chunk_alloc(chunksize, chunksize, false, + &zero, arena->dss_prec); + malloc_mutex_lock(&arena->lock); + if (chunk == NULL) + return (NULL); + if (config_stats) + arena->stats.mapped += chunksize; + + chunk->arena = arena; + + /* + * Claim that no pages are in use, since the header is merely overhead. + */ + chunk->ndirty = 0; + + chunk->nruns_avail = 0; + chunk->nruns_adjac = 0; + + /* + * Initialize the map to contain one maximal free untouched run. Mark + * the pages as zeroed iff chunk_alloc() returned a zeroed chunk. + */ + unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED; + arena_mapbits_unallocated_set(chunk, map_bias, arena_maxclass, + unzeroed); + /* + * There is no need to initialize the internal page map entries unless + * the chunk is not zeroed. + */ + if (zero == false) { + VALGRIND_MAKE_MEM_UNDEFINED((void *)arena_mapp_get(chunk, + map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk, + chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, + map_bias+1))); + for (i = map_bias+1; i < chunk_npages-1; i++) + arena_mapbits_unzeroed_set(chunk, i, unzeroed); + } else { + VALGRIND_MAKE_MEM_DEFINED((void *)arena_mapp_get(chunk, + map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk, + chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk, + map_bias+1))); + if (config_debug) { + for (i = map_bias+1; i < chunk_npages-1; i++) { + assert(arena_mapbits_unzeroed_get(chunk, i) == + unzeroed); } } - arena_mapbits_unallocated_set(chunk, chunk_npages-1, - arena_maxclass, unzeroed); } + arena_mapbits_unallocated_set(chunk, chunk_npages-1, arena_maxclass, + unzeroed); + + return (chunk); +} + +static arena_chunk_t * +arena_chunk_alloc(arena_t *arena) +{ + arena_chunk_t *chunk; + + if (arena->spare != NULL) + chunk = arena_chunk_init_spare(arena); + else + chunk = arena_chunk_init_hard(arena); /* Insert the run into the runs_avail tree. 
*/ arena_avail_insert(arena, chunk, map_bias, chunk_npages-map_bias, @@ -646,8 +657,7 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk) } static arena_run_t * -arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind, - bool zero) +arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { arena_run_t *run; arena_chunk_map_t *mapelm, key; @@ -662,7 +672,7 @@ arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind, run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << LG_PAGE)); - arena_run_split(arena, run, size, large, binind, zero); + arena_run_split_large(arena, run, size, zero); return (run); } @@ -670,19 +680,16 @@ arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind, } static arena_run_t * -arena_run_alloc(arena_t *arena, size_t size, bool large, size_t binind, - bool zero) +arena_run_alloc_large(arena_t *arena, size_t size, bool zero) { arena_chunk_t *chunk; arena_run_t *run; assert(size <= arena_maxclass); assert((size & PAGE_MASK) == 0); - assert((large && binind == BININD_INVALID) || (large == false && binind - != BININD_INVALID)); /* Search the arena's chunks for the lowest best fit. */ - run = arena_run_alloc_helper(arena, size, large, binind, zero); + run = arena_run_alloc_large_helper(arena, size, zero); if (run != NULL) return (run); @@ -692,7 +699,7 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, size_t binind, chunk = arena_chunk_alloc(arena); if (chunk != NULL) { run = (arena_run_t *)((uintptr_t)chunk + (map_bias << LG_PAGE)); - arena_run_split(arena, run, size, large, binind, zero); + arena_run_split_large(arena, run, size, zero); return (run); } @@ -701,7 +708,63 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, size_t binind, * sufficient memory available while this one dropped arena->lock in * arena_chunk_alloc(), so search one more time. */ - return (arena_run_alloc_helper(arena, size, large, binind, zero)); + return (arena_run_alloc_large_helper(arena, size, zero)); +} + +static arena_run_t * +arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind) +{ + arena_run_t *run; + arena_chunk_map_t *mapelm, key; + + key.bits = size | CHUNK_MAP_KEY; + mapelm = arena_avail_tree_nsearch(&arena->runs_avail, &key); + if (mapelm != NULL) { + arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm); + size_t pageind = (((uintptr_t)mapelm - + (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t)) + + map_bias; + + run = (arena_run_t *)((uintptr_t)run_chunk + (pageind << + LG_PAGE)); + arena_run_split_small(arena, run, size, binind); + return (run); + } + + return (NULL); +} + +static arena_run_t * +arena_run_alloc_small(arena_t *arena, size_t size, size_t binind) +{ + arena_chunk_t *chunk; + arena_run_t *run; + + assert(size <= arena_maxclass); + assert((size & PAGE_MASK) == 0); + assert(binind != BININD_INVALID); + + /* Search the arena's chunks for the lowest best fit. */ + run = arena_run_alloc_small_helper(arena, size, binind); + if (run != NULL) + return (run); + + /* + * No usable runs. Create a new chunk from which to allocate the run. + */ + chunk = arena_chunk_alloc(arena); + if (chunk != NULL) { + run = (arena_run_t *)((uintptr_t)chunk + (map_bias << LG_PAGE)); + arena_run_split_small(arena, run, size, binind); + return (run); + } + + /* + * arena_chunk_alloc() failed, but another thread may have made + * sufficient memory available while this one dropped arena->lock in + * arena_chunk_alloc(), so search one more time. 
+ */ + return (arena_run_alloc_small_helper(arena, size, binind)); } static inline void @@ -727,48 +790,42 @@ arena_maybe_purge(arena_t *arena) arena_purge(arena, false); } -static inline size_t -arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) +static arena_chunk_t * +chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg) { - size_t npurged; - ql_head(arena_chunk_map_t) mapelms; - arena_chunk_map_t *mapelm; - size_t pageind, npages; - size_t nmadvise; + size_t *ndirty = (size_t *)arg; - ql_new(&mapelms); + assert(chunk->ndirty != 0); + *ndirty += chunk->ndirty; + return (NULL); +} + +static size_t +arena_compute_npurgatory(arena_t *arena, bool all) +{ + size_t npurgatory, npurgeable; /* - * If chunk is the spare, temporarily re-allocate it, 1) so that its - * run is reinserted into runs_avail, and 2) so that it cannot be - * completely discarded by another thread while arena->lock is dropped - * by this thread. Note that the arena_run_dalloc() call will - * implicitly deallocate the chunk, so no explicit action is required - * in this function to deallocate the chunk. - * - * Note that once a chunk contains dirty pages, it cannot again contain - * a single run unless 1) it is a dirty run, or 2) this function purges - * dirty pages and causes the transition to a single clean run. Thus - * (chunk == arena->spare) is possible, but it is not possible for - * this function to be called on the spare unless it contains a dirty - * run. + * Compute the minimum number of pages that this thread should try to + * purge. */ - if (chunk == arena->spare) { - assert(arena_mapbits_dirty_get(chunk, map_bias) != 0); - assert(arena_mapbits_dirty_get(chunk, chunk_npages-1) != 0); + npurgeable = arena->ndirty - arena->npurgatory; - arena_chunk_alloc(arena); - } + if (all == false) { + size_t threshold = (arena->nactive >> opt_lg_dirty_mult); - if (config_stats) - arena->stats.purged += chunk->ndirty; + npurgatory = npurgeable - threshold; + } else + npurgatory = npurgeable; - /* - * Operate on all dirty runs if there is no clean/dirty run - * fragmentation. - */ - if (chunk->nruns_adjac == 0) - all = true; + return (npurgatory); +} + +static void +arena_chunk_stash_dirty(arena_t *arena, arena_chunk_t *chunk, bool all, + arena_chunk_mapelms_t *mapelms) +{ + size_t pageind, npages; /* * Temporarily allocate free dirty runs within chunk. If all is false, @@ -776,7 +833,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) * all dirty runs. */ for (pageind = map_bias; pageind < chunk_npages; pageind += npages) { - mapelm = arena_mapp_get(chunk, pageind); + arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind); if (arena_mapbits_allocated_get(chunk, pageind) == 0) { size_t run_size = arena_mapbits_unallocated_size_get(chunk, pageind); @@ -792,11 +849,11 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) arena_run_t *run = (arena_run_t *)((uintptr_t) chunk + (uintptr_t)(pageind << LG_PAGE)); - arena_run_split(arena, run, run_size, true, - BININD_INVALID, false); + arena_run_split_large(arena, run, run_size, + false); /* Append to list for later processing. */ ql_elm_new(mapelm, u.ql_link); - ql_tail_insert(&mapelms, mapelm, u.ql_link); + ql_tail_insert(mapelms, mapelm, u.ql_link); } } else { /* Skip run. 
*/ @@ -820,12 +877,20 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) assert(pageind == chunk_npages); assert(chunk->ndirty == 0 || all == false); assert(chunk->nruns_adjac == 0); +} + +static size_t +arena_chunk_purge_stashed(arena_t *arena, arena_chunk_t *chunk, + arena_chunk_mapelms_t *mapelms) +{ + size_t npurged, pageind, npages, nmadvise; + arena_chunk_map_t *mapelm; malloc_mutex_unlock(&arena->lock); if (config_stats) nmadvise = 0; npurged = 0; - ql_foreach(mapelm, &mapelms, u.ql_link) { + ql_foreach(mapelm, mapelms, u.ql_link) { bool unzeroed; size_t flag_unzeroed, i; @@ -859,30 +924,75 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) if (config_stats) arena->stats.nmadvise += nmadvise; + return (npurged); +} + +static void +arena_chunk_unstash_purged(arena_t *arena, arena_chunk_t *chunk, + arena_chunk_mapelms_t *mapelms) +{ + arena_chunk_map_t *mapelm; + size_t pageind; + /* Deallocate runs. */ - for (mapelm = ql_first(&mapelms); mapelm != NULL; - mapelm = ql_first(&mapelms)) { + for (mapelm = ql_first(mapelms); mapelm != NULL; + mapelm = ql_first(mapelms)) { arena_run_t *run; pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) / sizeof(arena_chunk_map_t)) + map_bias; run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind << LG_PAGE)); - ql_remove(&mapelms, mapelm, u.ql_link); + ql_remove(mapelms, mapelm, u.ql_link); arena_run_dalloc(arena, run, false, true); } - - return (npurged); } -static arena_chunk_t * -chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg) +static inline size_t +arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all) { - size_t *ndirty = (size_t *)arg; + size_t npurged; + arena_chunk_mapelms_t mapelms; - assert(chunk->ndirty != 0); - *ndirty += chunk->ndirty; - return (NULL); + ql_new(&mapelms); + + /* + * If chunk is the spare, temporarily re-allocate it, 1) so that its + * run is reinserted into runs_avail, and 2) so that it cannot be + * completely discarded by another thread while arena->lock is dropped + * by this thread. Note that the arena_run_dalloc() call will + * implicitly deallocate the chunk, so no explicit action is required + * in this function to deallocate the chunk. + * + * Note that once a chunk contains dirty pages, it cannot again contain + * a single run unless 1) it is a dirty run, or 2) this function purges + * dirty pages and causes the transition to a single clean run. Thus + * (chunk == arena->spare) is possible, but it is not possible for + * this function to be called on the spare unless it contains a dirty + * run. + */ + if (chunk == arena->spare) { + assert(arena_mapbits_dirty_get(chunk, map_bias) != 0); + assert(arena_mapbits_dirty_get(chunk, chunk_npages-1) != 0); + + arena_chunk_alloc(arena); + } + + if (config_stats) + arena->stats.purged += chunk->ndirty; + + /* + * Operate on all dirty runs if there is no clean/dirty run + * fragmentation. + */ + if (chunk->nruns_adjac == 0) + all = true; + + arena_chunk_stash_dirty(arena, chunk, all, &mapelms); + npurged = arena_chunk_purge_stashed(arena, chunk, &mapelms); + arena_chunk_unstash_purged(arena, chunk, &mapelms); + + return (npurged); } static void @@ -905,21 +1015,11 @@ arena_purge(arena_t *arena, bool all) arena->stats.npurge++; /* - * Compute the minimum number of pages that this thread should try to - * purge, and add the result to arena->npurgatory. This will keep - * multiple threads from racing to reduce ndirty below the threshold. 
+ * Add the minimum number of pages this thread should try to purge to + * arena->npurgatory. This will keep multiple threads from racing to + * reduce ndirty below the threshold. */ - { - size_t npurgeable = arena->ndirty - arena->npurgatory; - - if (all == false) { - size_t threshold = (arena->nactive >> - opt_lg_dirty_mult); - - npurgatory = npurgeable - threshold; - } else - npurgatory = npurgeable; - } + npurgatory = arena_compute_npurgatory(arena, all); arena->npurgatory += npurgatory; while (npurgatory > 0) { @@ -986,61 +1086,12 @@ arena_purge_all(arena_t *arena) } static void -arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) +arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size, + size_t *p_run_ind, size_t *p_run_pages, size_t flag_dirty) { - arena_chunk_t *chunk; - size_t size, run_ind, run_pages, flag_dirty; - - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); - run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); - assert(run_ind >= map_bias); - assert(run_ind < chunk_npages); - if (arena_mapbits_large_get(chunk, run_ind) != 0) { - size = arena_mapbits_large_size_get(chunk, run_ind); - assert(size == PAGE || - arena_mapbits_large_size_get(chunk, - run_ind+(size>>LG_PAGE)-1) == 0); - } else { - size_t binind = arena_bin_index(arena, run->bin); - arena_bin_info_t *bin_info = &arena_bin_info[binind]; - size = bin_info->run_size; - } - run_pages = (size >> LG_PAGE); - if (config_stats) { - /* - * Update stats_cactive if nactive is crossing a chunk - * multiple. - */ - size_t cactive_diff = CHUNK_CEILING(arena->nactive << LG_PAGE) - - CHUNK_CEILING((arena->nactive - run_pages) << LG_PAGE); - if (cactive_diff != 0) - stats_cactive_sub(cactive_diff); - } - arena->nactive -= run_pages; - - /* - * The run is dirty if the caller claims to have dirtied it, as well as - * if it was already dirty before being allocated and the caller - * doesn't claim to have cleaned it. - */ - assert(arena_mapbits_dirty_get(chunk, run_ind) == - arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); - if (cleaned == false && arena_mapbits_dirty_get(chunk, run_ind) != 0) - dirty = true; - flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0; - - /* Mark pages as unallocated in the chunk map. */ - if (dirty) { - arena_mapbits_unallocated_set(chunk, run_ind, size, - CHUNK_MAP_DIRTY); - arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size, - CHUNK_MAP_DIRTY); - } else { - arena_mapbits_unallocated_set(chunk, run_ind, size, - arena_mapbits_unzeroed_get(chunk, run_ind)); - arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size, - arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1)); - } + size_t size = *p_size; + size_t run_ind = *p_run_ind; + size_t run_pages = *p_run_pages; /* Try to coalesce forward. */ if (run_ind + run_pages < chunk_npages && @@ -1070,8 +1121,9 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) } /* Try to coalesce backward. 
*/ - if (run_ind > map_bias && arena_mapbits_allocated_get(chunk, run_ind-1) - == 0 && arena_mapbits_dirty_get(chunk, run_ind-1) == flag_dirty) { + if (run_ind > map_bias && arena_mapbits_allocated_get(chunk, + run_ind-1) == 0 && arena_mapbits_dirty_get(chunk, run_ind-1) == + flag_dirty) { size_t prun_size = arena_mapbits_unallocated_size_get(chunk, run_ind-1); size_t prun_pages = prun_size >> LG_PAGE; @@ -1096,6 +1148,62 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) size); } + *p_size = size; + *p_run_ind = run_ind; + *p_run_pages = run_pages; +} + +static void +arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned) +{ + arena_chunk_t *chunk; + size_t size, run_ind, run_pages, flag_dirty; + + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); + run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE); + assert(run_ind >= map_bias); + assert(run_ind < chunk_npages); + if (arena_mapbits_large_get(chunk, run_ind) != 0) { + size = arena_mapbits_large_size_get(chunk, run_ind); + assert(size == PAGE || + arena_mapbits_large_size_get(chunk, + run_ind+(size>>LG_PAGE)-1) == 0); + } else { + size_t binind = arena_bin_index(arena, run->bin); + arena_bin_info_t *bin_info = &arena_bin_info[binind]; + size = bin_info->run_size; + } + run_pages = (size >> LG_PAGE); + arena_cactive_update(arena, 0, run_pages); + arena->nactive -= run_pages; + + /* + * The run is dirty if the caller claims to have dirtied it, as well as + * if it was already dirty before being allocated and the caller + * doesn't claim to have cleaned it. + */ + assert(arena_mapbits_dirty_get(chunk, run_ind) == + arena_mapbits_dirty_get(chunk, run_ind+run_pages-1)); + if (cleaned == false && arena_mapbits_dirty_get(chunk, run_ind) != 0) + dirty = true; + flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0; + + /* Mark pages as unallocated in the chunk map. */ + if (dirty) { + arena_mapbits_unallocated_set(chunk, run_ind, size, + CHUNK_MAP_DIRTY); + arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size, + CHUNK_MAP_DIRTY); + } else { + arena_mapbits_unallocated_set(chunk, run_ind, size, + arena_mapbits_unzeroed_get(chunk, run_ind)); + arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size, + arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1)); + } + + arena_run_coalesce(arena, chunk, &size, &run_ind, &run_pages, + flag_dirty); + /* Insert into runs_avail, now that coalescing is complete. */ assert(arena_mapbits_unallocated_size_get(chunk, run_ind) == arena_mapbits_unallocated_size_get(chunk, run_ind+run_pages-1)); @@ -1263,7 +1371,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) malloc_mutex_unlock(&bin->lock); /******************************/ malloc_mutex_lock(&arena->lock); - run = arena_run_alloc(arena, bin_info->run_size, false, binind, false); + run = arena_run_alloc_small(arena, bin_info->run_size, binind); if (run != NULL) { bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run + (uintptr_t)bin_info->bitmap_offset); @@ -1286,7 +1394,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin) } /* - * arena_run_alloc() failed, but another thread may have made + * arena_run_alloc_small() failed, but another thread may have made * sufficient memory available while this one dropped bin->lock above, * so search one more time. */ @@ -1321,12 +1429,12 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin) arena_chunk_t *chunk; /* - * arena_run_alloc() may have allocated run, or it may - * have pulled run from the bin's run tree. 
Therefore - * it is unsafe to make any assumptions about how run - * has previously been used, and arena_bin_lower_run() - * must be called, as if a region were just deallocated - * from the run. + * arena_run_alloc_small() may have allocated run, or + * it may have pulled run from the bin's run tree. + * Therefore it is unsafe to make any assumptions about + * how run has previously been used, and + * arena_bin_lower_run() must be called, as if a region + * were just deallocated from the run. */ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run); if (run->nfree == bin_info->nregs) @@ -1404,8 +1512,28 @@ arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info, bool zero) } } -void -arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info) +#ifdef JEMALLOC_JET +#undef arena_redzone_corruption +#define arena_redzone_corruption JEMALLOC_N(arena_redzone_corruption_impl) +#endif +static void +arena_redzone_corruption(void *ptr, size_t usize, bool after, + size_t offset, uint8_t byte) +{ + + malloc_printf("<jemalloc>: Corrupt redzone %zu byte%s %s %p " + "(size %zu), byte=%#x\n", offset, (offset == 1) ? "" : "s", + after ? "after" : "before", ptr, usize, byte); +} +#ifdef JEMALLOC_JET +#undef arena_redzone_corruption +#define arena_redzone_corruption JEMALLOC_N(arena_redzone_corruption) +arena_redzone_corruption_t *arena_redzone_corruption = + JEMALLOC_N(arena_redzone_corruption_impl); +#endif + +static void +arena_redzones_validate(void *ptr, arena_bin_info_t *bin_info, bool reset) { size_t size = bin_info->reg_size; size_t redzone_size = bin_info->redzone_size; @@ -1413,29 +1541,61 @@ arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info) bool error = false; for (i = 1; i <= redzone_size; i++) { - unsigned byte; - if ((byte = *(uint8_t *)((uintptr_t)ptr - i)) != 0xa5) { + uint8_t *byte = (uint8_t *)((uintptr_t)ptr - i); + if (*byte != 0xa5) { error = true; - malloc_printf("<jemalloc>: Corrupt redzone " - "%zu byte%s before %p (size %zu), byte=%#x\n", i, - (i == 1) ? "" : "s", ptr, size, byte); + arena_redzone_corruption(ptr, size, false, i, *byte); + if (reset) + *byte = 0xa5; } } for (i = 0; i < redzone_size; i++) { - unsigned byte; - if ((byte = *(uint8_t *)((uintptr_t)ptr + size + i)) != 0xa5) { + uint8_t *byte = (uint8_t *)((uintptr_t)ptr + size + i); + if (*byte != 0xa5) { error = true; - malloc_printf("<jemalloc>: Corrupt redzone " - "%zu byte%s after end of %p (size %zu), byte=%#x\n", - i, (i == 1) ? 
"" : "s", ptr, size, byte); + arena_redzone_corruption(ptr, size, true, i, *byte); + if (reset) + *byte = 0xa5; } } if (opt_abort && error) abort(); +} +#ifdef JEMALLOC_JET +#undef arena_dalloc_junk_small +#define arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small_impl) +#endif +void +arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info) +{ + size_t redzone_size = bin_info->redzone_size; + + arena_redzones_validate(ptr, bin_info, false); memset((void *)((uintptr_t)ptr - redzone_size), 0x5a, bin_info->reg_interval); } +#ifdef JEMALLOC_JET +#undef arena_dalloc_junk_small +#define arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small) +arena_dalloc_junk_small_t *arena_dalloc_junk_small = + JEMALLOC_N(arena_dalloc_junk_small_impl); +#endif + +void +arena_quarantine_junk_small(void *ptr, size_t usize) +{ + size_t binind; + arena_bin_info_t *bin_info; + cassert(config_fill); + assert(opt_junk); + assert(opt_quarantine); + assert(usize <= SMALL_MAXCLASS); + + binind = SMALL_SIZE2BIN(usize); + bin_info = &arena_bin_info[binind]; + arena_redzones_validate(ptr, bin_info, true); +} void * arena_malloc_small(arena_t *arena, size_t size, bool zero) @@ -1500,7 +1660,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero) /* Large allocation. */ size = PAGE_CEILING(size); malloc_mutex_lock(&arena->lock); - ret = (void *)arena_run_alloc(arena, size, true, BININD_INVALID, zero); + ret = (void *)arena_run_alloc_large(arena, size, zero); if (ret == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); @@ -1546,7 +1706,7 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) alloc_size = size + alignment - PAGE; malloc_mutex_lock(&arena->lock); - run = arena_run_alloc(arena, alloc_size, true, BININD_INVALID, zero); + run = arena_run_alloc_large(arena, alloc_size, false); if (run == NULL) { malloc_mutex_unlock(&arena->lock); return (NULL); @@ -1566,6 +1726,7 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero) arena_run_trim_tail(arena, chunk, ret, size + trailsize, size, false); } + arena_run_init_large(arena, (arena_run_t *)ret, size, zero); if (config_stats) { arena->stats.nmalloc_large++; @@ -1769,21 +1930,38 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr, arena_dalloc_bin(arena, chunk, ptr, pageind, mapelm); } +#ifdef JEMALLOC_JET +#undef arena_dalloc_junk_large +#define arena_dalloc_junk_large JEMALLOC_N(arena_dalloc_junk_large_impl) +#endif +static void +arena_dalloc_junk_large(void *ptr, size_t usize) +{ + + if (config_fill && opt_junk) + memset(ptr, 0x5a, usize); +} +#ifdef JEMALLOC_JET +#undef arena_dalloc_junk_large +#define arena_dalloc_junk_large JEMALLOC_N(arena_dalloc_junk_large) +arena_dalloc_junk_large_t *arena_dalloc_junk_large = + JEMALLOC_N(arena_dalloc_junk_large_impl); +#endif + void arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr) { if (config_fill || config_stats) { size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE; - size_t size = arena_mapbits_large_size_get(chunk, pageind); + size_t usize = arena_mapbits_large_size_get(chunk, pageind); - if (config_fill && config_stats && opt_junk) - memset(ptr, 0x5a, size); + arena_dalloc_junk_large(ptr, usize); if (config_stats) { arena->stats.ndalloc_large++; - arena->stats.allocated_large -= size; - arena->stats.lstats[(size >> LG_PAGE) - 1].ndalloc++; - arena->stats.lstats[(size >> LG_PAGE) - 1].curruns--; + arena->stats.allocated_large -= usize; + arena->stats.lstats[(usize >> LG_PAGE) - 1].ndalloc++; + 
arena->stats.lstats[(usize >> LG_PAGE) - 1].curruns--; } } @@ -1854,9 +2032,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, size_t flag_dirty; size_t splitsize = (oldsize + followsize <= size + extra) ? followsize : size + extra - oldsize; - arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk + - ((pageind+npages) << LG_PAGE)), splitsize, true, - BININD_INVALID, zero); + arena_run_split_large(arena, (arena_run_t *)((uintptr_t)chunk + + ((pageind+npages) << LG_PAGE)), splitsize, zero); size = oldsize + splitsize; npages = size >> LG_PAGE; @@ -1895,6 +2072,26 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr, return (true); } +#ifdef JEMALLOC_JET +#undef arena_ralloc_junk_large +#define arena_ralloc_junk_large JEMALLOC_N(arena_ralloc_junk_large_impl) +#endif +static void +arena_ralloc_junk_large(void *ptr, size_t old_usize, size_t usize) +{ + + if (config_fill && opt_junk) { + memset((void *)((uintptr_t)ptr + usize), 0x5a, + old_usize - usize); + } +} +#ifdef JEMALLOC_JET +#undef arena_ralloc_junk_large +#define arena_ralloc_junk_large JEMALLOC_N(arena_ralloc_junk_large) +arena_ralloc_junk_large_t *arena_ralloc_junk_large = + JEMALLOC_N(arena_ralloc_junk_large_impl); +#endif + /* * Try to resize a large allocation, in order to avoid copying. This will * always fail if growing an object, and the following run is already in use. @@ -1908,10 +2105,6 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, psize = PAGE_CEILING(size + extra); if (psize == oldsize) { /* Same size class. */ - if (config_fill && opt_junk && size < oldsize) { - memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize - - size); - } return (false); } else { arena_chunk_t *chunk; @@ -1922,10 +2115,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, if (psize < oldsize) { /* Fill before shrinking in order avoid a race. 
*/ - if (config_fill && opt_junk) { - memset((void *)((uintptr_t)ptr + size), 0x5a, - oldsize - size); - } + arena_ralloc_junk_large(ptr, oldsize, psize); arena_ralloc_large_shrink(arena, chunk, ptr, oldsize, psize); return (false); @@ -1933,17 +2123,23 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra, bool ret = arena_ralloc_large_grow(arena, chunk, ptr, oldsize, PAGE_CEILING(size), psize - PAGE_CEILING(size), zero); - if (config_fill && ret == false && zero == false && - opt_zero) { - memset((void *)((uintptr_t)ptr + oldsize), 0, - size - oldsize); + if (config_fill && ret == false && zero == false) { + if (opt_junk) { + memset((void *)((uintptr_t)ptr + + oldsize), 0xa5, isalloc(ptr, + config_prof) - oldsize); + } else if (opt_zero) { + memset((void *)((uintptr_t)ptr + + oldsize), 0, isalloc(ptr, + config_prof) - oldsize); + } } return (ret); } } } -void * +bool arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, bool zero) { @@ -1958,25 +2154,20 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, if ((size + extra <= SMALL_MAXCLASS && SMALL_SIZE2BIN(size + extra) == SMALL_SIZE2BIN(oldsize)) || (size <= oldsize && - size + extra >= oldsize)) { - if (config_fill && opt_junk && size < oldsize) { - memset((void *)((uintptr_t)ptr + size), - 0x5a, oldsize - size); - } - return (ptr); - } + size + extra >= oldsize)) + return (false); } else { assert(size <= arena_maxclass); if (size + extra > SMALL_MAXCLASS) { if (arena_ralloc_large(ptr, oldsize, size, extra, zero) == false) - return (ptr); + return (false); } } } /* Reallocation would require a move. */ - return (NULL); + return (true); } void * @@ -1988,9 +2179,8 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t copysize; /* Try to avoid moving the allocation. */ - ret = arena_ralloc_no_move(ptr, oldsize, size, extra, zero); - if (ret != NULL) - return (ret); + if (arena_ralloc_no_move(ptr, oldsize, size, extra, zero) == false) + return (ptr); /* * size and oldsize are different enough that we need to move the @@ -2001,7 +2191,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t usize = sa2u(size + extra, alignment); if (usize == 0) return (NULL); - ret = ipallocx(usize, alignment, zero, try_tcache_alloc, arena); + ret = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); } else ret = arena_malloc(arena, size + extra, zero, try_tcache_alloc); @@ -2013,7 +2203,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t usize = sa2u(size, alignment); if (usize == 0) return (NULL); - ret = ipallocx(usize, alignment, zero, try_tcache_alloc, + ret = ipalloct(usize, alignment, zero, try_tcache_alloc, arena); } else ret = arena_malloc(arena, size, zero, try_tcache_alloc); @@ -2031,7 +2221,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size, copysize = (size < oldsize) ? 
size : oldsize; VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize); memcpy(ret, ptr, copysize); - iqallocx(ptr, try_tcache_dalloc); + iqalloct(ptr, try_tcache_dalloc); return (ret); } diff --git a/src/bitmap.c b/src/bitmap.c index b47e2629..e2bd907d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1,4 +1,4 @@ -#define JEMALLOC_BITMAP_C_ +#define JEMALLOC_BITMAP_C_ #include "jemalloc/internal/jemalloc_internal.h" /******************************************************************************/ diff --git a/src/chunk.c b/src/chunk.c index b17f43f0..90ab116a 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -180,7 +180,7 @@ chunk_alloc(size_t size, size_t alignment, bool base, bool *zero, label_return: if (ret != NULL) { if (config_ivsalloc && base == false) { - if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) { + if (rtree_set(chunks_rtree, (uintptr_t)ret, 1)) { chunk_dealloc(ret, size, true); return (NULL); } @@ -321,7 +321,7 @@ chunk_dealloc(void *chunk, size_t size, bool unmap) assert((size & chunksize_mask) == 0); if (config_ivsalloc) - rtree_set(chunks_rtree, (uintptr_t)chunk, NULL); + rtree_set(chunks_rtree, (uintptr_t)chunk, 0); if (config_stats || config_prof) { malloc_mutex_lock(&chunks_mtx); assert(stats_chunks.curchunks >= (size / chunksize)); @@ -356,7 +356,7 @@ chunk_boot(void) extent_tree_ad_new(&chunks_ad_dss); if (config_ivsalloc) { chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) - - opt_lg_chunk); + opt_lg_chunk, base_alloc, NULL); if (chunks_rtree == NULL) return (true); } @@ -368,7 +368,7 @@ void chunk_prefork(void) { - malloc_mutex_lock(&chunks_mtx); + malloc_mutex_prefork(&chunks_mtx); if (config_ivsalloc) rtree_prefork(chunks_rtree); chunk_dss_prefork(); diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 24781cc5..510bb8be 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -28,16 +28,17 @@ static void *dss_max; /******************************************************************************/ -#ifndef JEMALLOC_HAVE_SBRK static void * -sbrk(intptr_t increment) +chunk_dss_sbrk(intptr_t increment) { +#ifdef JEMALLOC_HAVE_SBRK + return (sbrk(increment)); +#else not_implemented(); - return (NULL); -} #endif +} dss_prec_t chunk_dss_prec_get(void) @@ -93,7 +94,7 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero) */ do { /* Get the current end of the DSS. */ - dss_max = sbrk(0); + dss_max = chunk_dss_sbrk(0); /* * Calculate how much padding is necessary to * chunk-align the end of the DSS. @@ -117,7 +118,7 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero) return (NULL); } incr = gap_size + cpad_size + size; - dss_prev = sbrk(incr); + dss_prev = chunk_dss_sbrk(incr); if (dss_prev == dss_max) { /* Success. 
*/ dss_max = dss_next; @@ -163,7 +164,7 @@ chunk_dss_boot(void) if (malloc_mutex_init(&dss_mtx)) return (true); - dss_base = sbrk(0); + dss_base = chunk_dss_sbrk(0); dss_prev = dss_base; dss_max = dss_base; diff --git a/src/chunk_mmap.c b/src/chunk_mmap.c index 8a42e759..2056d793 100644 --- a/src/chunk_mmap.c +++ b/src/chunk_mmap.c @@ -43,7 +43,7 @@ pages_map(void *addr, size_t size) if (munmap(ret, size) == -1) { char buf[BUFERROR_BUF]; - buferror(buf, sizeof(buf)); + buferror(get_errno(), buf, sizeof(buf)); malloc_printf("<jemalloc: Error in munmap(): %s\n", buf); if (opt_abort) @@ -69,7 +69,7 @@ pages_unmap(void *addr, size_t size) { char buf[BUFERROR_BUF]; - buferror(buf, sizeof(buf)); + buferror(get_errno(), buf, sizeof(buf)); malloc_printf("<jemalloc>: Error in " #ifdef _WIN32 "VirtualFree" @@ -49,7 +49,7 @@ static void ckh_shrink(ckh_t *ckh); * Search bucket for key and return the cell number if found; SIZE_T_MAX * otherwise. */ -JEMALLOC_INLINE size_t +JEMALLOC_INLINE_C size_t ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key) { ckhc_t *cell; @@ -67,7 +67,7 @@ ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key) /* * Search table for key and return cell number if found; SIZE_T_MAX otherwise. */ -JEMALLOC_INLINE size_t +JEMALLOC_INLINE_C size_t ckh_isearch(ckh_t *ckh, const void *key) { size_t hashes[2], bucket, cell; @@ -88,7 +88,7 @@ ckh_isearch(ckh_t *ckh, const void *key) return (cell); } -JEMALLOC_INLINE bool +JEMALLOC_INLINE_C bool ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, const void *data) { @@ -120,7 +120,7 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key, * eviction/relocation procedure until either success or detection of an * eviction/relocation bucket cycle. */ -JEMALLOC_INLINE bool +JEMALLOC_INLINE_C bool ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, void const **argdata) { @@ -190,7 +190,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey, } } -JEMALLOC_INLINE bool +JEMALLOC_INLINE_C bool ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata) { size_t hashes[2], bucket; @@ -219,7 +219,7 @@ ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata) * Try to rebuild the hash table from scratch by inserting all items from the * old table into the new. 
*/ -JEMALLOC_INLINE bool +JEMALLOC_INLINE_C bool ckh_rebuild(ckh_t *ckh, ckhc_t *aTab) { size_t count, i, nins; @@ -929,7 +929,7 @@ void ctl_prefork(void) { - malloc_mutex_lock(&ctl_mtx); + malloc_mutex_prefork(&ctl_mtx); } void @@ -1110,6 +1110,8 @@ label_return: \ return (ret); \ } +/******************************************************************************/ + CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *) static int @@ -1131,49 +1133,52 @@ label_return: return (ret); } -static int -thread_tcache_enabled_ctl(const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) -{ - int ret; - bool oldval; - - if (config_tcache == false) - return (ENOENT); - - oldval = tcache_enabled_get(); - if (newp != NULL) { - if (newlen != sizeof(bool)) { - ret = EINVAL; - goto label_return; - } - tcache_enabled_set(*(bool *)newp); - } - READ(oldval, bool); - - ret = 0; -label_return: - return (ret); -} - -static int -thread_tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, - size_t *oldlenp, void *newp, size_t newlen) -{ - int ret; +/******************************************************************************/ - if (config_tcache == false) - return (ENOENT); +CTL_RO_BOOL_CONFIG_GEN(config_debug) +CTL_RO_BOOL_CONFIG_GEN(config_dss) +CTL_RO_BOOL_CONFIG_GEN(config_fill) +CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock) +CTL_RO_BOOL_CONFIG_GEN(config_mremap) +CTL_RO_BOOL_CONFIG_GEN(config_munmap) +CTL_RO_BOOL_CONFIG_GEN(config_prof) +CTL_RO_BOOL_CONFIG_GEN(config_prof_libgcc) +CTL_RO_BOOL_CONFIG_GEN(config_prof_libunwind) +CTL_RO_BOOL_CONFIG_GEN(config_stats) +CTL_RO_BOOL_CONFIG_GEN(config_tcache) +CTL_RO_BOOL_CONFIG_GEN(config_tls) +CTL_RO_BOOL_CONFIG_GEN(config_utrace) +CTL_RO_BOOL_CONFIG_GEN(config_valgrind) +CTL_RO_BOOL_CONFIG_GEN(config_xmalloc) - READONLY(); - WRITEONLY(); +/******************************************************************************/ - tcache_flush(); +CTL_RO_NL_GEN(opt_abort, opt_abort, bool) +CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) +CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t) +CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t) +CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) +CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool) +CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, bool) +CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t) +CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool) +CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) +CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool) +CTL_RO_NL_CGEN(config_valgrind, opt_valgrind, opt_valgrind, bool) +CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) +CTL_RO_NL_CGEN(config_tcache, opt_tcache, opt_tcache, bool) +CTL_RO_NL_CGEN(config_tcache, opt_lg_tcache_max, opt_lg_tcache_max, ssize_t) +CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) +CTL_RO_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) /* Mutable. 
*/ +CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool) +CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) +CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool) +CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool) - ret = 0; -label_return: - return (ret); -} +/******************************************************************************/ static int thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, @@ -1235,50 +1240,49 @@ CTL_RO_NL_CGEN(config_stats, thread_deallocated, CTL_RO_NL_CGEN(config_stats, thread_deallocatedp, &thread_allocated_tsd_get()->deallocated, uint64_t *) -/******************************************************************************/ +static int +thread_tcache_enabled_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + bool oldval; -CTL_RO_BOOL_CONFIG_GEN(config_debug) -CTL_RO_BOOL_CONFIG_GEN(config_dss) -CTL_RO_BOOL_CONFIG_GEN(config_fill) -CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock) -CTL_RO_BOOL_CONFIG_GEN(config_mremap) -CTL_RO_BOOL_CONFIG_GEN(config_munmap) -CTL_RO_BOOL_CONFIG_GEN(config_prof) -CTL_RO_BOOL_CONFIG_GEN(config_prof_libgcc) -CTL_RO_BOOL_CONFIG_GEN(config_prof_libunwind) -CTL_RO_BOOL_CONFIG_GEN(config_stats) -CTL_RO_BOOL_CONFIG_GEN(config_tcache) -CTL_RO_BOOL_CONFIG_GEN(config_tls) -CTL_RO_BOOL_CONFIG_GEN(config_utrace) -CTL_RO_BOOL_CONFIG_GEN(config_valgrind) -CTL_RO_BOOL_CONFIG_GEN(config_xmalloc) + if (config_tcache == false) + return (ENOENT); -/******************************************************************************/ + oldval = tcache_enabled_get(); + if (newp != NULL) { + if (newlen != sizeof(bool)) { + ret = EINVAL; + goto label_return; + } + tcache_enabled_set(*(bool *)newp); + } + READ(oldval, bool); -CTL_RO_NL_GEN(opt_abort, opt_abort, bool) -CTL_RO_NL_GEN(opt_dss, opt_dss, const char *) -CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t) -CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t) -CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t) -CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool) -CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, bool) -CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool) -CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t) -CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool) -CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool) -CTL_RO_NL_CGEN(config_valgrind, opt_valgrind, opt_valgrind, bool) -CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool) -CTL_RO_NL_CGEN(config_tcache, opt_tcache, opt_tcache, bool) -CTL_RO_NL_CGEN(config_tcache, opt_lg_tcache_max, opt_lg_tcache_max, ssize_t) -CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) -CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) -CTL_RO_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) /* Mutable. 
*/ -CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t) -CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t) -CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool) -CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool) -CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool) -CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool) + ret = 0; +label_return: + return (ret); +} + +static int +thread_tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp, + size_t *oldlenp, void *newp, size_t newlen) +{ + int ret; + + if (config_tcache == false) + return (ENOENT); + + READONLY(); + WRITEONLY(); + + tcache_flush(); + + ret = 0; +label_return: + return (ret); +} /******************************************************************************/ @@ -1390,31 +1394,8 @@ label_return: return (ret); } - /******************************************************************************/ -CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t) -CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t) -CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t) -static const ctl_named_node_t * -arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i) -{ - - if (i > NBINS) - return (NULL); - return (super_arenas_bin_i_node); -} - -CTL_RO_NL_GEN(arenas_lrun_i_size, ((mib[2]+1) << LG_PAGE), size_t) -static const ctl_named_node_t * -arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i) -{ - - if (i > nlclasses) - return (NULL); - return (super_arenas_lrun_i_node); -} - static int arenas_narenas_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, void *newp, size_t newlen) @@ -1468,7 +1449,28 @@ CTL_RO_NL_GEN(arenas_page, PAGE, size_t) CTL_RO_NL_CGEN(config_tcache, arenas_tcache_max, tcache_maxclass, size_t) CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned) CTL_RO_NL_CGEN(config_tcache, arenas_nhbins, nhbins, unsigned) +CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t) +CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t) +CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t) +static const ctl_named_node_t * +arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i) +{ + + if (i > NBINS) + return (NULL); + return (super_arenas_bin_i_node); +} + CTL_RO_NL_GEN(arenas_nlruns, nlclasses, size_t) +CTL_RO_NL_GEN(arenas_lrun_i_size, ((mib[2]+1) << LG_PAGE), size_t) +static const ctl_named_node_t * +arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i) +{ + + if (i > nlclasses) + return (NULL); + return (super_arenas_lrun_i_node); +} static int arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, @@ -1575,6 +1577,11 @@ CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t) /******************************************************************************/ +CTL_RO_CGEN(config_stats, stats_cactive, &stats_cactive, size_t *) +CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats.allocated, size_t) +CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t) +CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t) + CTL_RO_CGEN(config_stats, stats_chunks_current, ctl_stats.chunks.current, size_t) CTL_RO_CGEN(config_stats, stats_chunks_total, ctl_stats.chunks.total, uint64_t) @@ -1582,6 +1589,20 @@ CTL_RO_CGEN(config_stats, stats_chunks_high, ctl_stats.chunks.high, size_t) CTL_RO_CGEN(config_stats, 
stats_huge_allocated, huge_allocated, size_t) CTL_RO_CGEN(config_stats, stats_huge_nmalloc, huge_nmalloc, uint64_t) CTL_RO_CGEN(config_stats, stats_huge_ndalloc, huge_ndalloc, uint64_t) + +CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *) +CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) +CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t) +CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_mapped, + ctl_stats.arenas[mib[2]].astats.mapped, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_npurge, + ctl_stats.arenas[mib[2]].astats.npurge, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise, + ctl_stats.arenas[mib[2]].astats.nmadvise, uint64_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_purged, + ctl_stats.arenas[mib[2]].astats.purged, uint64_t) + CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated, ctl_stats.arenas[mib[2]].allocated_small, size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc, @@ -1645,19 +1666,6 @@ stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j) return (super_stats_arenas_i_lruns_j_node); } -CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned) -CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *) -CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t) -CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t) -CTL_RO_CGEN(config_stats, stats_arenas_i_mapped, - ctl_stats.arenas[mib[2]].astats.mapped, size_t) -CTL_RO_CGEN(config_stats, stats_arenas_i_npurge, - ctl_stats.arenas[mib[2]].astats.npurge, uint64_t) -CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise, - ctl_stats.arenas[mib[2]].astats.nmadvise, uint64_t) -CTL_RO_CGEN(config_stats, stats_arenas_i_purged, - ctl_stats.arenas[mib[2]].astats.purged, uint64_t) - static const ctl_named_node_t * stats_arenas_i_index(const size_t *mib, size_t miblen, size_t i) { @@ -1674,8 +1682,3 @@ label_return: malloc_mutex_unlock(&ctl_mtx); return (ret); } - -CTL_RO_CGEN(config_stats, stats_cactive, &stats_cactive, size_t *) -CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats.allocated, size_t) -CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t) -CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t) @@ -78,7 +78,7 @@ huge_palloc(size_t size, size_t alignment, bool zero) return (ret); } -void * +bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) { @@ -89,15 +89,11 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) && CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size) && CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) { assert(CHUNK_CEILING(oldsize) == oldsize); - if (config_fill && opt_junk && size < oldsize) { - memset((void *)((uintptr_t)ptr + size), 0x5a, - oldsize - size); - } - return (ptr); + return (false); } /* Reallocation would require a move. */ - return (NULL); + return (true); } void * @@ -108,9 +104,8 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, size_t copysize; /* Try to avoid moving the allocation. 
*/ - ret = huge_ralloc_no_move(ptr, oldsize, size, extra); - if (ret != NULL) - return (ret); + if (huge_ralloc_no_move(ptr, oldsize, size, extra) == false) + return (ptr); /* * size and oldsize are different enough that we need to use a @@ -169,7 +164,7 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, */ char buf[BUFERROR_BUF]; - buferror(buf, sizeof(buf)); + buferror(get_errno(), buf, sizeof(buf)); malloc_printf("<jemalloc>: Error in mremap(): %s\n", buf); if (opt_abort) @@ -181,11 +176,34 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra, #endif { memcpy(ret, ptr, copysize); - iqallocx(ptr, try_tcache_dalloc); + iqalloct(ptr, try_tcache_dalloc); } return (ret); } +#ifdef JEMALLOC_JET +#undef huge_dalloc_junk +#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) +#endif +static void +huge_dalloc_junk(void *ptr, size_t usize) +{ + + if (config_fill && config_dss && opt_junk) { + /* + * Only bother junk filling if the chunk isn't about to be + * unmapped. + */ + if (config_munmap == false || (config_dss && chunk_in_dss(ptr))) + memset(ptr, 0x5a, usize); + } +} +#ifdef JEMALLOC_JET +#undef huge_dalloc_junk +#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk) +huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); +#endif + void huge_dalloc(void *ptr, bool unmap) { @@ -208,8 +226,8 @@ huge_dalloc(void *ptr, bool unmap) malloc_mutex_unlock(&huge_mtx); - if (unmap && config_fill && config_dss && opt_junk) - memset(node->addr, 0x5a, node->size); + if (unmap) + huge_dalloc_junk(node->addr, node->size); chunk_dealloc(node->addr, node->size, unmap); diff --git a/src/jemalloc.c b/src/jemalloc.c index ae56db6b..563d99f8 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -100,18 +100,12 @@ typedef struct { #endif /******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static void stats_print_atexit(void); -static unsigned malloc_ncpus(void); -static bool malloc_conf_next(char const **opts_p, char const **k_p, - size_t *klen_p, char const **v_p, size_t *vlen_p); -static void malloc_conf_error(const char *msg, const char *k, size_t klen, - const char *v, size_t vlen); -static void malloc_conf_init(void); +/* + * Function prototypes for static functions that are referenced prior to + * definition. + */ + static bool malloc_init_hard(void); -static int imemalign(void **memptr, size_t alignment, size_t size, - size_t min_alignment); /******************************************************************************/ /* @@ -252,7 +246,6 @@ stats_print_atexit(void) static unsigned malloc_ncpus(void) { - unsigned ret; long result; #ifdef _WIN32 @@ -262,14 +255,7 @@ malloc_ncpus(void) #else result = sysconf(_SC_NPROCESSORS_ONLN); #endif - if (result == -1) { - /* Error. */ - ret = 1; - } else { - ret = (unsigned)result; - } - - return (ret); + return ((result == -1) ? 
1 : (unsigned)result); } void @@ -484,8 +470,7 @@ malloc_conf_init(void) } break; } default: - /* NOTREACHED */ - assert(false); + not_reached(); buf[0] = '\0'; opts = buf; } @@ -522,14 +507,15 @@ malloc_conf_init(void) "Invalid conf value", \ k, klen, v, vlen); \ } else if (clip) { \ - if (um < min) \ + if (min != 0 && um < min) \ o = min; \ else if (um > max) \ o = max; \ else \ o = um; \ } else { \ - if (um < min || um > max) { \ + if ((min != 0 && um < min) || \ + um > max) { \ malloc_conf_error( \ "Out-of-range " \ "conf value", \ @@ -695,17 +681,6 @@ malloc_init_hard(void) malloc_conf_init(); -#if (!defined(JEMALLOC_MUTEX_INIT_CB) && !defined(JEMALLOC_ZONE) \ - && !defined(_WIN32)) - /* Register fork handlers. */ - if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent, - jemalloc_postfork_child) != 0) { - malloc_write("<jemalloc>: Error in pthread_atfork()\n"); - if (opt_abort) - abort(); - } -#endif - if (opt_stats_print) { /* Print statistics at exit. */ if (atexit(stats_print_atexit) != 0) { @@ -745,8 +720,10 @@ malloc_init_hard(void) return (true); } - if (malloc_mutex_init(&arenas_lock)) + if (malloc_mutex_init(&arenas_lock)) { + malloc_mutex_unlock(&init_lock); return (true); + } /* * Create enough scaffolding to allow recursive allocation in @@ -792,9 +769,25 @@ malloc_init_hard(void) return (true); } - /* Get number of CPUs. */ malloc_mutex_unlock(&init_lock); + /**********************************************************************/ + /* Recursive allocation may follow. */ + ncpus = malloc_ncpus(); + +#if (!defined(JEMALLOC_MUTEX_INIT_CB) && !defined(JEMALLOC_ZONE) \ + && !defined(_WIN32)) + /* LinuxThreads's pthread_atfork() allocates. */ + if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent, + jemalloc_postfork_child) != 0) { + malloc_write("<jemalloc>: Error in pthread_atfork()\n"); + if (opt_abort) + abort(); + } +#endif + + /* Done recursively allocating. */ + /**********************************************************************/ malloc_mutex_lock(&init_lock); if (mutex_boot()) { @@ -841,6 +834,7 @@ malloc_init_hard(void) malloc_initialized = true; malloc_mutex_unlock(&init_lock); + return (false); } @@ -852,42 +846,88 @@ malloc_init_hard(void) * Begin malloc(3)-compatible functions. */ +static void * +imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if (cnt == NULL) + return (NULL); + if (prof_promote && usize <= SMALL_MAXCLASS) { + p = imalloc(SMALL_MAXCLASS+1); + if (p == NULL) + return (NULL); + arena_prof_promoted(p, usize); + } else + p = imalloc(usize); + + return (p); +} + +JEMALLOC_ALWAYS_INLINE_C void * +imalloc_prof(size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if ((uintptr_t)cnt != (uintptr_t)1U) + p = imalloc_prof_sample(usize, cnt); + else + p = imalloc(usize); + if (p == NULL) + return (NULL); + prof_malloc(p, usize, cnt); + + return (p); +} + +/* + * MALLOC_BODY() is a macro rather than a function because its contents are in + * the fast path, but inlining would cause reliability issues when determining + * how many frames to discard from heap profiling backtraces. + */ +#define MALLOC_BODY(ret, size, usize) do { \ + if (malloc_init()) \ + ret = NULL; \ + else { \ + if (config_prof && opt_prof) { \ + prof_thr_cnt_t *cnt; \ + \ + usize = s2u(size); \ + /* \ + * Call PROF_ALLOC_PREP() here rather than in \ + * imalloc_prof() so that imalloc_prof() can be \ + * inlined without introducing uncertainty \ + * about the number of backtrace frames to \ + * ignore. 
imalloc_prof() is in the fast path \ + * when heap profiling is enabled, so inlining \ + * is critical to performance. (For \ + * consistency all callers of PROF_ALLOC_PREP() \ + * are structured similarly, even though e.g. \ + * realloc() isn't called enough for inlining \ + * to be critical.) \ + */ \ + PROF_ALLOC_PREP(1, usize, cnt); \ + ret = imalloc_prof(usize, cnt); \ + } else { \ + if (config_stats || (config_valgrind && \ + opt_valgrind)) \ + usize = s2u(size); \ + ret = imalloc(size); \ + } \ + } \ +} while (0) + void * je_malloc(size_t size) { void *ret; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL); - - if (malloc_init()) { - ret = NULL; - goto label_oom; - } if (size == 0) size = 1; - if (config_prof && opt_prof) { - usize = s2u(size); - PROF_ALLOC_PREP(1, usize, cnt); - if (cnt == NULL) { - ret = NULL; - goto label_oom; - } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= - SMALL_MAXCLASS) { - ret = imalloc(SMALL_MAXCLASS+1); - if (ret != NULL) - arena_prof_promoted(ret, usize); - } else - ret = imalloc(size); - } else { - if (config_stats || (config_valgrind && opt_valgrind)) - usize = s2u(size); - ret = imalloc(size); - } + MALLOC_BODY(ret, size, usize); -label_oom: if (ret == NULL) { if (config_xmalloc && opt_xmalloc) { malloc_write("<jemalloc>: Error in malloc(): " @@ -896,8 +936,6 @@ label_oom: } set_errno(ENOMEM); } - if (config_prof && opt_prof && ret != NULL) - prof_malloc(ret, usize, cnt); if (config_stats && ret != NULL) { assert(usize == isalloc(ret, config_prof)); thread_allocated_tsd_get()->allocated += usize; @@ -907,6 +945,42 @@ label_oom: return (ret); } +static void * +imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if (cnt == NULL) + return (NULL); + if (prof_promote && usize <= SMALL_MAXCLASS) { + assert(sa2u(SMALL_MAXCLASS+1, alignment) != 0); + p = ipalloc(sa2u(SMALL_MAXCLASS+1, alignment), alignment, + false); + if (p == NULL) + return (NULL); + arena_prof_promoted(p, usize); + } else + p = ipalloc(usize, alignment, false); + + return (p); +} + +JEMALLOC_ALWAYS_INLINE_C void * +imemalign_prof(size_t alignment, size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if ((uintptr_t)cnt != (uintptr_t)1U) + p = imemalign_prof_sample(alignment, usize, cnt); + else + p = ipalloc(usize, alignment, false); + if (p == NULL) + return (NULL); + prof_malloc(p, usize, cnt); + + return (p); +} + JEMALLOC_ATTR(nonnull(1)) #ifdef JEMALLOC_PROF /* @@ -916,19 +990,18 @@ JEMALLOC_ATTR(nonnull(1)) JEMALLOC_NOINLINE #endif static int -imemalign(void **memptr, size_t alignment, size_t size, - size_t min_alignment) +imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment) { int ret; size_t usize; void *result; - prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL); assert(min_alignment != 0); - if (malloc_init()) + if (malloc_init()) { result = NULL; - else { + goto label_oom; + } else { if (size == 0) size = 1; @@ -948,57 +1021,38 @@ imemalign(void **memptr, size_t alignment, size_t size, usize = sa2u(size, alignment); if (usize == 0) { result = NULL; - ret = ENOMEM; - goto label_return; + goto label_oom; } if (config_prof && opt_prof) { + prof_thr_cnt_t *cnt; + PROF_ALLOC_PREP(2, usize, cnt); - if (cnt == NULL) { - result = NULL; - ret = EINVAL; - } else { - if (prof_promote && (uintptr_t)cnt != - (uintptr_t)1U && usize <= SMALL_MAXCLASS) { - assert(sa2u(SMALL_MAXCLASS+1, - alignment) != 0); - result = ipalloc(sa2u(SMALL_MAXCLASS+1, - alignment), 
alignment, false); - if (result != NULL) { - arena_prof_promoted(result, - usize); - } - } else { - result = ipalloc(usize, alignment, - false); - } - } + result = imemalign_prof(alignment, usize, cnt); } else result = ipalloc(usize, alignment, false); - } - - if (result == NULL) { - if (config_xmalloc && opt_xmalloc) { - malloc_write("<jemalloc>: Error allocating aligned " - "memory: out of memory\n"); - abort(); - } - ret = ENOMEM; - goto label_return; + if (result == NULL) + goto label_oom; } *memptr = result; ret = 0; - label_return: if (config_stats && result != NULL) { assert(usize == isalloc(result, config_prof)); thread_allocated_tsd_get()->allocated += usize; } - if (config_prof && opt_prof && result != NULL) - prof_malloc(result, usize, cnt); UTRACE(0, size, result); return (ret); +label_oom: + assert(result == NULL); + if (config_xmalloc && opt_xmalloc) { + malloc_write("<jemalloc>: Error allocating aligned memory: " + "out of memory\n"); + abort(); + } + ret = ENOMEM; + goto label_return; } int @@ -1025,13 +1079,46 @@ je_aligned_alloc(size_t alignment, size_t size) return (ret); } +static void * +icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if (cnt == NULL) + return (NULL); + if (prof_promote && usize <= SMALL_MAXCLASS) { + p = icalloc(SMALL_MAXCLASS+1); + if (p == NULL) + return (NULL); + arena_prof_promoted(p, usize); + } else + p = icalloc(usize); + + return (p); +} + +JEMALLOC_ALWAYS_INLINE_C void * +icalloc_prof(size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if ((uintptr_t)cnt != (uintptr_t)1U) + p = icalloc_prof_sample(usize, cnt); + else + p = icalloc(usize); + if (p == NULL) + return (NULL); + prof_malloc(p, usize, cnt); + + return (p); +} + void * je_calloc(size_t num, size_t size) { void *ret; size_t num_size; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL); if (malloc_init()) { num_size = 0; @@ -1060,19 +1147,11 @@ je_calloc(size_t num, size_t size) } if (config_prof && opt_prof) { + prof_thr_cnt_t *cnt; + usize = s2u(num_size); PROF_ALLOC_PREP(1, usize, cnt); - if (cnt == NULL) { - ret = NULL; - goto label_return; - } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize - <= SMALL_MAXCLASS) { - ret = icalloc(SMALL_MAXCLASS+1); - if (ret != NULL) - arena_prof_promoted(ret, usize); - } else - ret = icalloc(num_size); + ret = icalloc_prof(usize, cnt); } else { if (config_stats || (config_valgrind && opt_valgrind)) usize = s2u(num_size); @@ -1088,9 +1167,6 @@ label_return: } set_errno(ENOMEM); } - - if (config_prof && opt_prof && ret != NULL) - prof_malloc(ret, usize, cnt); if (config_stats && ret != NULL) { assert(usize == isalloc(ret, config_prof)); thread_allocated_tsd_get()->allocated += usize; @@ -1100,152 +1176,126 @@ label_return: return (ret); } +static void * +irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + + if (cnt == NULL) + return (NULL); + if (prof_promote && usize <= SMALL_MAXCLASS) { + p = iralloc(oldptr, SMALL_MAXCLASS+1, 0, 0, false); + if (p == NULL) + return (NULL); + arena_prof_promoted(p, usize); + } else + p = iralloc(oldptr, usize, 0, 0, false); + + return (p); +} + +JEMALLOC_ALWAYS_INLINE_C void * +irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_thr_cnt_t *cnt) +{ + void *p; + prof_ctx_t *old_ctx; + + old_ctx = prof_ctx_get(oldptr); + if ((uintptr_t)cnt != (uintptr_t)1U) + p = irealloc_prof_sample(oldptr, usize, cnt); + else + p = iralloc(oldptr, usize, 0, 0, false); + if (p == NULL) + return (NULL); + 
prof_realloc(p, usize, cnt, old_usize, old_ctx); + + return (p); +} + +JEMALLOC_INLINE_C void +ifree(void *ptr) +{ + size_t usize; + UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); + + assert(ptr != NULL); + assert(malloc_initialized || IS_INITIALIZER); + + if (config_prof && opt_prof) { + usize = isalloc(ptr, config_prof); + prof_free(ptr, usize); + } else if (config_stats || config_valgrind) + usize = isalloc(ptr, config_prof); + if (config_stats) + thread_allocated_tsd_get()->deallocated += usize; + if (config_valgrind && opt_valgrind) + rzsize = p2rz(ptr); + iqalloc(ptr); + JEMALLOC_VALGRIND_FREE(ptr, rzsize); +} + void * je_realloc(void *ptr, size_t size) { void *ret; size_t usize JEMALLOC_CC_SILENCE_INIT(0); - size_t old_size = 0; - size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); - prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL); - prof_ctx_t *old_ctx JEMALLOC_CC_SILENCE_INIT(NULL); + size_t old_usize = 0; + UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); if (size == 0) { if (ptr != NULL) { - /* realloc(ptr, 0) is equivalent to free(p). */ - assert(malloc_initialized || IS_INITIALIZER); - if (config_prof) { - old_size = isalloc(ptr, true); - if (config_valgrind && opt_valgrind) - old_rzsize = p2rz(ptr); - } else if (config_stats) { - old_size = isalloc(ptr, false); - if (config_valgrind && opt_valgrind) - old_rzsize = u2rz(old_size); - } else if (config_valgrind && opt_valgrind) { - old_size = isalloc(ptr, false); - old_rzsize = u2rz(old_size); - } - if (config_prof && opt_prof) { - old_ctx = prof_ctx_get(ptr); - cnt = NULL; - } - iqalloc(ptr); - ret = NULL; - goto label_return; - } else - size = 1; + /* realloc(ptr, 0) is equivalent to free(ptr). */ + UTRACE(ptr, 0, 0); + ifree(ptr); + return (NULL); + } + size = 1; } if (ptr != NULL) { assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if (config_prof) { - old_size = isalloc(ptr, true); - if (config_valgrind && opt_valgrind) - old_rzsize = p2rz(ptr); - } else if (config_stats) { - old_size = isalloc(ptr, false); - if (config_valgrind && opt_valgrind) - old_rzsize = u2rz(old_size); - } else if (config_valgrind && opt_valgrind) { - old_size = isalloc(ptr, false); - old_rzsize = u2rz(old_size); - } + if ((config_prof && opt_prof) || config_stats || + (config_valgrind && opt_valgrind)) + old_usize = isalloc(ptr, config_prof); + if (config_valgrind && opt_valgrind) + old_rzsize = config_prof ? p2rz(ptr) : u2rz(old_usize); + if (config_prof && opt_prof) { + prof_thr_cnt_t *cnt; + usize = s2u(size); - old_ctx = prof_ctx_get(ptr); PROF_ALLOC_PREP(1, usize, cnt); - if (cnt == NULL) { - old_ctx = NULL; - ret = NULL; - goto label_oom; - } - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && - usize <= SMALL_MAXCLASS) { - ret = iralloc(ptr, SMALL_MAXCLASS+1, 0, 0, - false, false); - if (ret != NULL) - arena_prof_promoted(ret, usize); - else - old_ctx = NULL; - } else { - ret = iralloc(ptr, size, 0, 0, false, false); - if (ret == NULL) - old_ctx = NULL; - } + ret = irealloc_prof(ptr, old_usize, usize, cnt); } else { if (config_stats || (config_valgrind && opt_valgrind)) usize = s2u(size); - ret = iralloc(ptr, size, 0, 0, false, false); - } - -label_oom: - if (ret == NULL) { - if (config_xmalloc && opt_xmalloc) { - malloc_write("<jemalloc>: Error in realloc(): " - "out of memory\n"); - abort(); - } - set_errno(ENOMEM); + ret = iralloc(ptr, size, 0, 0, false); } } else { /* realloc(NULL, size) is equivalent to malloc(size). 
*/ - if (config_prof && opt_prof) - old_ctx = NULL; - if (malloc_init()) { - if (config_prof && opt_prof) - cnt = NULL; - ret = NULL; - } else { - if (config_prof && opt_prof) { - usize = s2u(size); - PROF_ALLOC_PREP(1, usize, cnt); - if (cnt == NULL) - ret = NULL; - else { - if (prof_promote && (uintptr_t)cnt != - (uintptr_t)1U && usize <= - SMALL_MAXCLASS) { - ret = imalloc(SMALL_MAXCLASS+1); - if (ret != NULL) { - arena_prof_promoted(ret, - usize); - } - } else - ret = imalloc(size); - } - } else { - if (config_stats || (config_valgrind && - opt_valgrind)) - usize = s2u(size); - ret = imalloc(size); - } - } + MALLOC_BODY(ret, size, usize); + } - if (ret == NULL) { - if (config_xmalloc && opt_xmalloc) { - malloc_write("<jemalloc>: Error in realloc(): " - "out of memory\n"); - abort(); - } - set_errno(ENOMEM); + if (ret == NULL) { + if (config_xmalloc && opt_xmalloc) { + malloc_write("<jemalloc>: Error in realloc(): " + "out of memory\n"); + abort(); } + set_errno(ENOMEM); } - -label_return: - if (config_prof && opt_prof) - prof_realloc(ret, usize, cnt, old_size, old_ctx); if (config_stats && ret != NULL) { thread_allocated_t *ta; assert(usize == isalloc(ret, config_prof)); ta = thread_allocated_tsd_get(); ta->allocated += usize; - ta->deallocated += old_size; + ta->deallocated += old_usize; } UTRACE(ptr, size, ret); - JEMALLOC_VALGRIND_REALLOC(ret, usize, ptr, old_size, old_rzsize, false); + JEMALLOC_VALGRIND_REALLOC(ret, usize, ptr, old_usize, old_rzsize, + false); return (ret); } @@ -1254,24 +1304,8 @@ je_free(void *ptr) { UTRACE(ptr, 0, 0); - if (ptr != NULL) { - size_t usize; - size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); - - assert(malloc_initialized || IS_INITIALIZER); - - if (config_prof && opt_prof) { - usize = isalloc(ptr, config_prof); - prof_free(ptr, usize); - } else if (config_stats || config_valgrind) - usize = isalloc(ptr, config_prof); - if (config_stats) - thread_allocated_tsd_get()->deallocated += usize; - if (config_valgrind && opt_valgrind) - rzsize = p2rz(ptr); - iqalloc(ptr); - JEMALLOC_VALGRIND_FREE(ptr, rzsize); - } + if (ptr != NULL) + ifree(ptr); } /* @@ -1337,208 +1371,344 @@ JEMALLOC_EXPORT void *(* __memalign_hook)(size_t alignment, size_t size) = * Begin non-standard functions. */ -size_t -je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) +JEMALLOC_ALWAYS_INLINE_C void * +imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache, + arena_t *arena) { - size_t ret; - assert(malloc_initialized || IS_INITIALIZER); - malloc_thread_init(); + assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize, + alignment))); - if (config_ivsalloc) - ret = ivsalloc(ptr, config_prof); + if (alignment != 0) + return (ipalloct(usize, alignment, zero, try_tcache, arena)); + else if (zero) + return (icalloct(usize, try_tcache, arena)); else - ret = (ptr != NULL) ? isalloc(ptr, config_prof) : 0; - - return (ret); + return (imalloct(usize, try_tcache, arena)); } -void -je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, - const char *opts) +static void * +imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache, + arena_t *arena, prof_thr_cnt_t *cnt) { + void *p; - stats_print(write_cb, cbopaque, opts); + if (cnt == NULL) + return (NULL); + if (prof_promote && usize <= SMALL_MAXCLASS) { + size_t usize_promoted = (alignment == 0) ? 
+ s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1, alignment); + assert(usize_promoted != 0); + p = imallocx(usize_promoted, alignment, zero, try_tcache, + arena); + if (p == NULL) + return (NULL); + arena_prof_promoted(p, usize); + } else + p = imallocx(usize, alignment, zero, try_tcache, arena); + + return (p); } -int -je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, - size_t newlen) +JEMALLOC_ALWAYS_INLINE_C void * +imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache, + arena_t *arena, prof_thr_cnt_t *cnt) { + void *p; - if (malloc_init()) - return (EAGAIN); + if ((uintptr_t)cnt != (uintptr_t)1U) { + p = imallocx_prof_sample(usize, alignment, zero, try_tcache, + arena, cnt); + } else + p = imallocx(usize, alignment, zero, try_tcache, arena); + if (p == NULL) + return (NULL); + prof_malloc(p, usize, cnt); - return (ctl_byname(name, oldp, oldlenp, newp, newlen)); + return (p); } -int -je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) +void * +je_mallocx(size_t size, int flags) { + void *p; + size_t usize; + size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) + & (SIZE_T_MAX-1)); + bool zero = flags & MALLOCX_ZERO; + unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; + arena_t *arena; + bool try_tcache; + + assert(size != 0); if (malloc_init()) - return (EAGAIN); + goto label_oom; - return (ctl_nametomib(name, mibp, miblenp)); + if (arena_ind != UINT_MAX) { + arena = arenas[arena_ind]; + try_tcache = false; + } else { + arena = NULL; + try_tcache = true; + } + + usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); + assert(usize != 0); + + if (config_prof && opt_prof) { + prof_thr_cnt_t *cnt; + + PROF_ALLOC_PREP(1, usize, cnt); + p = imallocx_prof(usize, alignment, zero, try_tcache, arena, + cnt); + } else + p = imallocx(usize, alignment, zero, try_tcache, arena); + if (p == NULL) + goto label_oom; + + if (config_stats) { + assert(usize == isalloc(p, config_prof)); + thread_allocated_tsd_get()->allocated += usize; + } + UTRACE(0, size, p); + JEMALLOC_VALGRIND_MALLOC(true, p, usize, zero); + return (p); +label_oom: + if (config_xmalloc && opt_xmalloc) { + malloc_write("<jemalloc>: Error in mallocx(): out of memory\n"); + abort(); + } + UTRACE(0, size, 0); + return (NULL); } -int -je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, - void *newp, size_t newlen) +static void * +irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize, + bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena, + prof_thr_cnt_t *cnt) { + void *p; - if (malloc_init()) - return (EAGAIN); + if (cnt == NULL) + return (NULL); + if (prof_promote && usize <= SMALL_MAXCLASS) { + p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= + size) ? 0 : size - (SMALL_MAXCLASS+1), alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena); + if (p == NULL) + return (NULL); + arena_prof_promoted(p, usize); + } else { + p = iralloct(oldptr, size, 0, alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena); + } - return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); + return (p); } -/* - * End non-standard functions. - */ -/******************************************************************************/ -/* - * Begin experimental functions. 
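For reference, the flag word that je_mallocx() decodes above packs the lg of the requested alignment into its low bits, a zero-fill request into MALLOCX_ZERO, and a one-biased arena index into the bits above 8. A minimal caller-side sketch, assuming the public mallocx()/rallocx()/sallocx()/dallocx() entry points and the MALLOCX_ALIGN()/MALLOCX_ZERO convenience macros from the installed jemalloc header (none of which appear in this diff):

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	/* 4 KiB region, 64-byte aligned, zero-filled; flags are OR'd together. */
	void *p = mallocx(4096, MALLOCX_ALIGN(64) | MALLOCX_ZERO);
	void *q;

	if (p == NULL)
		return (1);
	/* sallocx() reports the usable size actually backing the allocation. */
	printf("usable size: %zu\n", sallocx(p, 0));
	/* rallocx() takes the same flag encoding; on failure p is left intact. */
	q = rallocx(p, 8192, MALLOCX_ALIGN(64));
	if (q != NULL)
		p = q;
	dallocx(p, 0);
	return (0);
}

The ALLOCM_* compatibility shims later in this diff reduce to exactly these calls, which is why allocm()/rallocm()/sallocm()/dallocm() can be kept as thin wrappers.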
- */ -#ifdef JEMALLOC_EXPERIMENTAL - JEMALLOC_ALWAYS_INLINE_C void * -iallocm(size_t usize, size_t alignment, bool zero, bool try_tcache, - arena_t *arena) +irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment, + size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, + arena_t *arena, prof_thr_cnt_t *cnt) { + void *p; + prof_ctx_t *old_ctx; - assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize, - alignment))); + old_ctx = prof_ctx_get(oldptr); + if ((uintptr_t)cnt != (uintptr_t)1U) + p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero, + try_tcache_alloc, try_tcache_dalloc, arena, cnt); + else { + p = iralloct(oldptr, size, 0, alignment, zero, + try_tcache_alloc, try_tcache_dalloc, arena); + } + if (p == NULL) + return (NULL); - if (alignment != 0) - return (ipallocx(usize, alignment, zero, try_tcache, arena)); - else if (zero) - return (icallocx(usize, try_tcache, arena)); - else - return (imallocx(usize, try_tcache, arena)); + if (p == oldptr && alignment != 0) { + /* + * The allocation did not move, so it is possible that the size + * class is smaller than would guarantee the requested + * alignment, and that the alignment constraint was + * serendipitously satisfied. Additionally, old_usize may not + * be the same as the current usize because of in-place large + * reallocation. Therefore, query the actual value of usize. + */ + *usize = isalloc(p, config_prof); + } + prof_realloc(p, *usize, cnt, old_usize, old_ctx); + + return (p); } -int -je_allocm(void **ptr, size_t *rsize, size_t size, int flags) +void * +je_rallocx(void *ptr, size_t size, int flags) { void *p; - size_t usize; - size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + size_t usize, old_usize; + UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); + size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) & (SIZE_T_MAX-1)); - bool zero = flags & ALLOCM_ZERO; + bool zero = flags & MALLOCX_ZERO; unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; + bool try_tcache_alloc, try_tcache_dalloc; arena_t *arena; - bool try_tcache; assert(ptr != NULL); assert(size != 0); - - if (malloc_init()) - goto label_oom; + assert(malloc_initialized || IS_INITIALIZER); + malloc_thread_init(); if (arena_ind != UINT_MAX) { + arena_chunk_t *chunk; + try_tcache_alloc = false; + chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); + try_tcache_dalloc = (chunk == ptr || chunk->arena != + arenas[arena_ind]); arena = arenas[arena_ind]; - try_tcache = false; } else { + try_tcache_alloc = true; + try_tcache_dalloc = true; arena = NULL; - try_tcache = true; } - usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); - if (usize == 0) - goto label_oom; + if ((config_prof && opt_prof) || config_stats || + (config_valgrind && opt_valgrind)) + old_usize = isalloc(ptr, config_prof); + if (config_valgrind && opt_valgrind) + old_rzsize = u2rz(old_usize); if (config_prof && opt_prof) { prof_thr_cnt_t *cnt; + usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment); + assert(usize != 0); PROF_ALLOC_PREP(1, usize, cnt); - if (cnt == NULL) + p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero, + try_tcache_alloc, try_tcache_dalloc, arena, cnt); + if (p == NULL) goto label_oom; - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <= - SMALL_MAXCLASS) { - size_t usize_promoted = (alignment == 0) ? 
- s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1, - alignment); - assert(usize_promoted != 0); - p = iallocm(usize_promoted, alignment, zero, - try_tcache, arena); - if (p == NULL) - goto label_oom; - arena_prof_promoted(p, usize); - } else { - p = iallocm(usize, alignment, zero, try_tcache, arena); - if (p == NULL) - goto label_oom; - } - prof_malloc(p, usize, cnt); } else { - p = iallocm(usize, alignment, zero, try_tcache, arena); + p = iralloct(ptr, size, 0, alignment, zero, try_tcache_alloc, + try_tcache_dalloc, arena); if (p == NULL) goto label_oom; + if (config_stats || (config_valgrind && opt_valgrind)) + usize = isalloc(p, config_prof); } - if (rsize != NULL) - *rsize = usize; - *ptr = p; if (config_stats) { - assert(usize == isalloc(p, config_prof)); - thread_allocated_tsd_get()->allocated += usize; + thread_allocated_t *ta; + ta = thread_allocated_tsd_get(); + ta->allocated += usize; + ta->deallocated += old_usize; } - UTRACE(0, size, p); - JEMALLOC_VALGRIND_MALLOC(true, p, usize, zero); - return (ALLOCM_SUCCESS); + UTRACE(ptr, size, p); + JEMALLOC_VALGRIND_REALLOC(p, usize, ptr, old_usize, old_rzsize, zero); + return (p); label_oom: if (config_xmalloc && opt_xmalloc) { - malloc_write("<jemalloc>: Error in allocm(): " - "out of memory\n"); + malloc_write("<jemalloc>: Error in rallocx(): out of memory\n"); abort(); } - *ptr = NULL; - UTRACE(0, size, 0); - return (ALLOCM_ERR_OOM); + UTRACE(ptr, size, 0); + return (NULL); } -int -je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) +JEMALLOC_ALWAYS_INLINE_C size_t +ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra, + size_t alignment, bool zero, arena_t *arena) +{ + size_t usize; + + if (ixalloc(ptr, size, extra, alignment, zero)) + return (old_usize); + usize = isalloc(ptr, config_prof); + + return (usize); +} + +static size_t +ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra, + size_t alignment, size_t max_usize, bool zero, arena_t *arena, + prof_thr_cnt_t *cnt) +{ + size_t usize; + + if (cnt == NULL) + return (old_usize); + /* Use minimum usize to determine whether promotion may happen. */ + if (prof_promote && ((alignment == 0) ? s2u(size) : sa2u(size, + alignment)) <= SMALL_MAXCLASS) { + if (ixalloc(ptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= + size+extra) ? 
0 : size+extra - (SMALL_MAXCLASS+1), + alignment, zero)) + return (old_usize); + usize = isalloc(ptr, config_prof); + if (max_usize < PAGE) + arena_prof_promoted(ptr, usize); + } else { + usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, + zero, arena); + } + + return (usize); +} + +JEMALLOC_ALWAYS_INLINE_C size_t +ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra, + size_t alignment, size_t max_usize, bool zero, arena_t *arena, + prof_thr_cnt_t *cnt) { - void *p, *q; size_t usize; - size_t old_size; - size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); - size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + prof_ctx_t *old_ctx; + + old_ctx = prof_ctx_get(ptr); + if ((uintptr_t)cnt != (uintptr_t)1U) { + usize = ixallocx_prof_sample(ptr, old_usize, size, extra, + alignment, zero, max_usize, arena, cnt); + } else { + usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, + zero, arena); + } + if (usize == old_usize) + return (usize); + prof_realloc(ptr, usize, cnt, old_usize, old_ctx); + + return (usize); +} + +size_t +je_xallocx(void *ptr, size_t size, size_t extra, int flags) +{ + size_t usize, old_usize; + UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0); + size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) & (SIZE_T_MAX-1)); - bool zero = flags & ALLOCM_ZERO; - bool no_move = flags & ALLOCM_NO_MOVE; + bool zero = flags & MALLOCX_ZERO; unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; - bool try_tcache_alloc, try_tcache_dalloc; arena_t *arena; assert(ptr != NULL); - assert(*ptr != NULL); assert(size != 0); assert(SIZE_T_MAX - size >= extra); assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); - if (arena_ind != UINT_MAX) { - arena_chunk_t *chunk; - try_tcache_alloc = true; - chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(*ptr); - try_tcache_dalloc = (chunk == *ptr || chunk->arena != - arenas[arena_ind]); + if (arena_ind != UINT_MAX) arena = arenas[arena_ind]; - } else { - try_tcache_alloc = true; - try_tcache_dalloc = true; + else arena = NULL; - } - p = *ptr; + old_usize = isalloc(ptr, config_prof); + if (config_valgrind && opt_valgrind) + old_rzsize = u2rz(old_usize); + if (config_prof && opt_prof) { prof_thr_cnt_t *cnt; - /* - * usize isn't knowable before iralloc() returns when extra is + * usize isn't knowable before ixalloc() returns when extra is * non-zero. Therefore, compute its maximum possible value and * use that in PROF_ALLOC_PREP() to decide whether to capture a * backtrace. prof_realloc() will use the actual usize to @@ -1546,112 +1716,51 @@ je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) */ size_t max_usize = (alignment == 0) ? s2u(size+extra) : sa2u(size+extra, alignment); - prof_ctx_t *old_ctx = prof_ctx_get(p); - old_size = isalloc(p, true); - if (config_valgrind && opt_valgrind) - old_rzsize = p2rz(p); PROF_ALLOC_PREP(1, max_usize, cnt); - if (cnt == NULL) - goto label_oom; - /* - * Use minimum usize to determine whether promotion may happen. - */ - if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U - && ((alignment == 0) ? s2u(size) : sa2u(size, alignment)) - <= SMALL_MAXCLASS) { - q = irallocx(p, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >= - size+extra) ? 
0 : size+extra - (SMALL_MAXCLASS+1), - alignment, zero, no_move, try_tcache_alloc, - try_tcache_dalloc, arena); - if (q == NULL) - goto label_err; - if (max_usize < PAGE) { - usize = max_usize; - arena_prof_promoted(q, usize); - } else - usize = isalloc(q, config_prof); - } else { - q = irallocx(p, size, extra, alignment, zero, no_move, - try_tcache_alloc, try_tcache_dalloc, arena); - if (q == NULL) - goto label_err; - usize = isalloc(q, config_prof); - } - prof_realloc(q, usize, cnt, old_size, old_ctx); - if (rsize != NULL) - *rsize = usize; + usize = ixallocx_prof(ptr, old_usize, size, extra, alignment, + max_usize, zero, arena, cnt); } else { - if (config_stats) { - old_size = isalloc(p, false); - if (config_valgrind && opt_valgrind) - old_rzsize = u2rz(old_size); - } else if (config_valgrind && opt_valgrind) { - old_size = isalloc(p, false); - old_rzsize = u2rz(old_size); - } - q = irallocx(p, size, extra, alignment, zero, no_move, - try_tcache_alloc, try_tcache_dalloc, arena); - if (q == NULL) - goto label_err; - if (config_stats) - usize = isalloc(q, config_prof); - if (rsize != NULL) { - if (config_stats == false) - usize = isalloc(q, config_prof); - *rsize = usize; - } + usize = ixallocx_helper(ptr, old_usize, size, extra, alignment, + zero, arena); } + if (usize == old_usize) + goto label_not_resized; - *ptr = q; if (config_stats) { thread_allocated_t *ta; ta = thread_allocated_tsd_get(); ta->allocated += usize; - ta->deallocated += old_size; - } - UTRACE(p, size, q); - JEMALLOC_VALGRIND_REALLOC(q, usize, p, old_size, old_rzsize, zero); - return (ALLOCM_SUCCESS); -label_err: - if (no_move) { - UTRACE(p, size, q); - return (ALLOCM_ERR_NOT_MOVED); - } -label_oom: - if (config_xmalloc && opt_xmalloc) { - malloc_write("<jemalloc>: Error in rallocm(): " - "out of memory\n"); - abort(); + ta->deallocated += old_usize; } - UTRACE(p, size, 0); - return (ALLOCM_ERR_OOM); + JEMALLOC_VALGRIND_REALLOC(ptr, usize, ptr, old_usize, old_rzsize, zero); +label_not_resized: + UTRACE(ptr, size, ptr); + return (usize); } -int -je_sallocm(const void *ptr, size_t *rsize, int flags) +size_t +je_sallocx(const void *ptr, int flags) { - size_t sz; + size_t usize; assert(malloc_initialized || IS_INITIALIZER); malloc_thread_init(); if (config_ivsalloc) - sz = ivsalloc(ptr, config_prof); + usize = ivsalloc(ptr, config_prof); else { assert(ptr != NULL); - sz = isalloc(ptr, config_prof); + usize = isalloc(ptr, config_prof); } - assert(rsize != NULL); - *rsize = sz; - return (ALLOCM_SUCCESS); + return (usize); } -int -je_dallocm(void *ptr, int flags) +void +je_dallocx(void *ptr, int flags) { size_t usize; - size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); + UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0); unsigned arena_ind = ((unsigned)(flags >> 8)) - 1; bool try_tcache; @@ -1677,28 +1786,162 @@ je_dallocm(void *ptr, int flags) thread_allocated_tsd_get()->deallocated += usize; if (config_valgrind && opt_valgrind) rzsize = p2rz(ptr); - iqallocx(ptr, try_tcache); + iqalloct(ptr, try_tcache); JEMALLOC_VALGRIND_FREE(ptr, rzsize); - - return (ALLOCM_SUCCESS); } -int -je_nallocm(size_t *rsize, size_t size, int flags) +size_t +je_nallocx(size_t size, int flags) { size_t usize; - size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK) + size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK) & (SIZE_T_MAX-1)); assert(size != 0); if (malloc_init()) - return (ALLOCM_ERR_OOM); + return (0); usize = (alignment == 0) ? 
s2u(size) : sa2u(size, alignment); - if (usize == 0) + assert(usize != 0); + return (usize); +} + +int +je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp, + size_t newlen) +{ + + if (malloc_init()) + return (EAGAIN); + + return (ctl_byname(name, oldp, oldlenp, newp, newlen)); +} + +int +je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp) +{ + + if (malloc_init()) + return (EAGAIN); + + return (ctl_nametomib(name, mibp, miblenp)); +} + +int +je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp, + void *newp, size_t newlen) +{ + + if (malloc_init()) + return (EAGAIN); + + return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen)); +} + +void +je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque, + const char *opts) +{ + + stats_print(write_cb, cbopaque, opts); +} + +size_t +je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr) +{ + size_t ret; + + assert(malloc_initialized || IS_INITIALIZER); + malloc_thread_init(); + + if (config_ivsalloc) + ret = ivsalloc(ptr, config_prof); + else + ret = (ptr != NULL) ? isalloc(ptr, config_prof) : 0; + + return (ret); +} + +/* + * End non-standard functions. + */ +/******************************************************************************/ +/* + * Begin experimental functions. + */ +#ifdef JEMALLOC_EXPERIMENTAL + +int +je_allocm(void **ptr, size_t *rsize, size_t size, int flags) +{ + void *p; + + assert(ptr != NULL); + + p = je_mallocx(size, flags); + if (p == NULL) return (ALLOCM_ERR_OOM); + if (rsize != NULL) + *rsize = isalloc(p, config_prof); + *ptr = p; + return (ALLOCM_SUCCESS); +} + +int +je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags) +{ + int ret; + bool no_move = flags & ALLOCM_NO_MOVE; + + assert(ptr != NULL); + assert(*ptr != NULL); + assert(size != 0); + assert(SIZE_T_MAX - size >= extra); + + if (no_move) { + size_t usize = je_xallocx(*ptr, size, extra, flags); + ret = (usize >= size) ? ALLOCM_SUCCESS : ALLOCM_ERR_NOT_MOVED; + if (rsize != NULL) + *rsize = usize; + } else { + void *p = je_rallocx(*ptr, size+extra, flags); + if (p != NULL) { + *ptr = p; + ret = ALLOCM_SUCCESS; + } else + ret = ALLOCM_ERR_OOM; + if (rsize != NULL) + *rsize = isalloc(*ptr, config_prof); + } + return (ret); +} + +int +je_sallocm(const void *ptr, size_t *rsize, int flags) +{ + assert(rsize != NULL); + *rsize = je_sallocx(ptr, flags); + return (ALLOCM_SUCCESS); +} + +int +je_dallocm(void *ptr, int flags) +{ + + je_dallocx(ptr, flags); + return (ALLOCM_SUCCESS); +} + +int +je_nallocm(size_t *rsize, size_t size, int flags) +{ + size_t usize; + + usize = je_nallocx(size, flags); + if (usize == 0) + return (ALLOCM_ERR_OOM); if (rsize != NULL) *rsize = usize; return (ALLOCM_SUCCESS); diff --git a/src/mutex.c b/src/mutex.c index 55e18c23..788eca38 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -6,7 +6,7 @@ #endif #ifndef _CRT_SPINCOUNT -#define _CRT_SPINCOUNT 4000 +#define _CRT_SPINCOUNT 4000 #endif /******************************************************************************/ @@ -24,7 +24,12 @@ bool opt_prof_gdump = false; bool opt_prof_final = true; bool opt_prof_leak = false; bool opt_prof_accum = false; -char opt_prof_prefix[PATH_MAX + 1]; +char opt_prof_prefix[ + /* Minimize memory bloat for non-prof builds. 
*/ +#ifdef JEMALLOC_PROF + PATH_MAX + +#endif + 1]; uint64_t prof_interval = 0; bool prof_promote; @@ -54,10 +59,17 @@ static uint64_t prof_dump_useq; /* * This buffer is rather large for stack allocation, so use a single buffer for - * all profile dumps. The buffer is implicitly protected by bt2ctx_mtx, since - * it must be locked anyway during dumping. + * all profile dumps. */ -static char prof_dump_buf[PROF_DUMP_BUFSIZE]; +static malloc_mutex_t prof_dump_mtx; +static char prof_dump_buf[ + /* Minimize memory bloat for non-prof builds. */ +#ifdef JEMALLOC_PROF + PROF_DUMP_BUFSIZE +#else + 1 +#endif +]; static unsigned prof_dump_buf_end; static int prof_dump_fd; @@ -65,36 +77,6 @@ static int prof_dump_fd; static bool prof_booted = false; /******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static prof_bt_t *bt_dup(prof_bt_t *bt); -static void bt_destroy(prof_bt_t *bt); -#ifdef JEMALLOC_PROF_LIBGCC -static _Unwind_Reason_Code prof_unwind_init_callback( - struct _Unwind_Context *context, void *arg); -static _Unwind_Reason_Code prof_unwind_callback( - struct _Unwind_Context *context, void *arg); -#endif -static bool prof_flush(bool propagate_err); -static bool prof_write(bool propagate_err, const char *s); -static bool prof_printf(bool propagate_err, const char *format, ...) - JEMALLOC_ATTR(format(printf, 2, 3)); -static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, - size_t *leak_nctx); -static void prof_ctx_destroy(prof_ctx_t *ctx); -static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt); -static bool prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, - prof_bt_t *bt); -static bool prof_dump_maps(bool propagate_err); -static bool prof_dump(bool propagate_err, const char *filename, - bool leakcheck); -static void prof_dump_filename(char *filename, char v, int64_t vseq); -static void prof_fdump(void); -static void prof_bt_hash(const void *key, size_t r_hash[2]); -static bool prof_bt_keycomp(const void *k1, const void *k2); -static malloc_mutex_t *prof_ctx_mutex_choose(void); - -/******************************************************************************/ void bt_init(prof_bt_t *bt, void **vec) @@ -423,10 +405,169 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore) { cassert(config_prof); - assert(false); + not_reached(); } #endif +static malloc_mutex_t * +prof_ctx_mutex_choose(void) +{ + unsigned nctxs = atomic_add_u(&cum_ctxs, 1); + + return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]); +} + +static void +prof_ctx_init(prof_ctx_t *ctx, prof_bt_t *bt) +{ + + ctx->bt = bt; + ctx->lock = prof_ctx_mutex_choose(); + /* + * Set nlimbo to 1, in order to avoid a race condition with + * prof_ctx_merge()/prof_ctx_destroy(). + */ + ctx->nlimbo = 1; + ql_elm_new(ctx, dump_link); + memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t)); + ql_new(&ctx->cnts_ql); +} + +static void +prof_ctx_destroy(prof_ctx_t *ctx) +{ + prof_tdata_t *prof_tdata; + + cassert(config_prof); + + /* + * Check that ctx is still unused by any thread cache before destroying + * it. prof_lookup() increments ctx->nlimbo in order to avoid a race + * condition with this function, as does prof_ctx_merge() in order to + * avoid a race between the main body of prof_ctx_merge() and entry + * into this function. 
+ */ + prof_tdata = prof_tdata_get(false); + assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX); + prof_enter(prof_tdata); + malloc_mutex_lock(ctx->lock); + if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0 && + ctx->nlimbo == 1) { + assert(ctx->cnt_merged.curbytes == 0); + assert(ctx->cnt_merged.accumobjs == 0); + assert(ctx->cnt_merged.accumbytes == 0); + /* Remove ctx from bt2ctx. */ + if (ckh_remove(&bt2ctx, ctx->bt, NULL, NULL)) + not_reached(); + prof_leave(prof_tdata); + /* Destroy ctx. */ + malloc_mutex_unlock(ctx->lock); + bt_destroy(ctx->bt); + idalloc(ctx); + } else { + /* + * Compensate for increment in prof_ctx_merge() or + * prof_lookup(). + */ + ctx->nlimbo--; + malloc_mutex_unlock(ctx->lock); + prof_leave(prof_tdata); + } +} + +static void +prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +{ + bool destroy; + + cassert(config_prof); + + /* Merge cnt stats and detach from ctx. */ + malloc_mutex_lock(ctx->lock); + ctx->cnt_merged.curobjs += cnt->cnts.curobjs; + ctx->cnt_merged.curbytes += cnt->cnts.curbytes; + ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; + ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; + ql_remove(&ctx->cnts_ql, cnt, cnts_link); + if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL && + ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { + /* + * Increment ctx->nlimbo in order to keep another thread from + * winning the race to destroy ctx while this one has ctx->lock + * dropped. Without this, it would be possible for another + * thread to: + * + * 1) Sample an allocation associated with ctx. + * 2) Deallocate the sampled object. + * 3) Successfully prof_ctx_destroy(ctx). + * + * The result would be that ctx no longer exists by the time + * this thread accesses it in prof_ctx_destroy(). + */ + ctx->nlimbo++; + destroy = true; + } else + destroy = false; + malloc_mutex_unlock(ctx->lock); + if (destroy) + prof_ctx_destroy(ctx); +} + +static bool +prof_lookup_global(prof_bt_t *bt, prof_tdata_t *prof_tdata, void **p_btkey, + prof_ctx_t **p_ctx, bool *p_new_ctx) +{ + union { + prof_ctx_t *p; + void *v; + } ctx; + union { + prof_bt_t *p; + void *v; + } btkey; + bool new_ctx; + + prof_enter(prof_tdata); + if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { + /* bt has never been seen before. Insert it. */ + ctx.v = imalloc(sizeof(prof_ctx_t)); + if (ctx.v == NULL) { + prof_leave(prof_tdata); + return (true); + } + btkey.p = bt_dup(bt); + if (btkey.v == NULL) { + prof_leave(prof_tdata); + idalloc(ctx.v); + return (true); + } + prof_ctx_init(ctx.p, btkey.p); + if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { + /* OOM. */ + prof_leave(prof_tdata); + idalloc(btkey.v); + idalloc(ctx.v); + return (true); + } + new_ctx = true; + } else { + /* + * Increment nlimbo, in order to avoid a race condition with + * prof_ctx_merge()/prof_ctx_destroy(). + */ + malloc_mutex_lock(ctx.p->lock); + ctx.p->nlimbo++; + malloc_mutex_unlock(ctx.p->lock); + new_ctx = false; + } + prof_leave(prof_tdata); + + *p_btkey = btkey.v; + *p_ctx = ctx.p; + *p_new_ctx = new_ctx; + return (false); +} + prof_thr_cnt_t * prof_lookup(prof_bt_t *bt) { @@ -443,62 +584,16 @@ prof_lookup(prof_bt_t *bt) return (NULL); if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) { - union { - prof_bt_t *p; - void *v; - } btkey; - union { - prof_ctx_t *p; - void *v; - } ctx; + void *btkey; + prof_ctx_t *ctx; bool new_ctx; /* * This thread's cache lacks bt. Look for it in the global * cache. 
*/ - prof_enter(prof_tdata); - if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) { - /* bt has never been seen before. Insert it. */ - ctx.v = imalloc(sizeof(prof_ctx_t)); - if (ctx.v == NULL) { - prof_leave(prof_tdata); - return (NULL); - } - btkey.p = bt_dup(bt); - if (btkey.v == NULL) { - prof_leave(prof_tdata); - idalloc(ctx.v); - return (NULL); - } - ctx.p->bt = btkey.p; - ctx.p->lock = prof_ctx_mutex_choose(); - /* - * Set nlimbo to 1, in order to avoid a race condition - * with prof_ctx_merge()/prof_ctx_destroy(). - */ - ctx.p->nlimbo = 1; - memset(&ctx.p->cnt_merged, 0, sizeof(prof_cnt_t)); - ql_new(&ctx.p->cnts_ql); - if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) { - /* OOM. */ - prof_leave(prof_tdata); - idalloc(btkey.v); - idalloc(ctx.v); - return (NULL); - } - new_ctx = true; - } else { - /* - * Increment nlimbo, in order to avoid a race condition - * with prof_ctx_merge()/prof_ctx_destroy(). - */ - malloc_mutex_lock(ctx.p->lock); - ctx.p->nlimbo++; - malloc_mutex_unlock(ctx.p->lock); - new_ctx = false; - } - prof_leave(prof_tdata); + if (prof_lookup_global(bt, prof_tdata, &btkey, &ctx, &new_ctx)) + return (NULL); /* Link a prof_thd_cnt_t into ctx for this thread. */ if (ckh_count(&prof_tdata->bt2cnt) == PROF_TCMAX) { @@ -511,7 +606,7 @@ prof_lookup(prof_bt_t *bt) assert(ret.v != NULL); if (ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt, NULL, NULL)) - assert(false); + not_reached(); ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); prof_ctx_merge(ret.p->ctx, ret.p); /* ret can now be re-used. */ @@ -521,27 +616,27 @@ prof_lookup(prof_bt_t *bt) ret.v = imalloc(sizeof(prof_thr_cnt_t)); if (ret.p == NULL) { if (new_ctx) - prof_ctx_destroy(ctx.p); + prof_ctx_destroy(ctx); return (NULL); } ql_elm_new(ret.p, cnts_link); ql_elm_new(ret.p, lru_link); } /* Finish initializing ret. */ - ret.p->ctx = ctx.p; + ret.p->ctx = ctx; ret.p->epoch = 0; memset(&ret.p->cnts, 0, sizeof(prof_cnt_t)); - if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) { + if (ckh_insert(&prof_tdata->bt2cnt, btkey, ret.v)) { if (new_ctx) - prof_ctx_destroy(ctx.p); + prof_ctx_destroy(ctx); idalloc(ret.v); return (NULL); } ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link); - malloc_mutex_lock(ctx.p->lock); - ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link); - ctx.p->nlimbo--; - malloc_mutex_unlock(ctx.p->lock); + malloc_mutex_lock(ctx->lock); + ql_tail_insert(&ctx->cnts_ql, ret.p, cnts_link); + ctx->nlimbo--; + malloc_mutex_unlock(ctx->lock); } else { /* Move ret to the front of the LRU. 
*/ ql_remove(&prof_tdata->lru_ql, ret.p, lru_link); @@ -551,8 +646,52 @@ prof_lookup(prof_bt_t *bt) return (ret.p); } +#ifdef JEMALLOC_JET +size_t +prof_bt_count(void) +{ + size_t bt_count; + prof_tdata_t *prof_tdata; + + prof_tdata = prof_tdata_get(false); + if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) + return (0); + + prof_enter(prof_tdata); + bt_count = ckh_count(&bt2ctx); + prof_leave(prof_tdata); + + return (bt_count); +} +#endif + +#ifdef JEMALLOC_JET +#undef prof_dump_open +#define prof_dump_open JEMALLOC_N(prof_dump_open_impl) +#endif +static int +prof_dump_open(bool propagate_err, const char *filename) +{ + int fd; + + fd = creat(filename, 0644); + if (fd == -1 && propagate_err == false) { + malloc_printf("<jemalloc>: creat(\"%s\"), 0644) failed\n", + filename); + if (opt_abort) + abort(); + } + + return (fd); +} +#ifdef JEMALLOC_JET +#undef prof_dump_open +#define prof_dump_open JEMALLOC_N(prof_dump_open) +prof_dump_open_t *prof_dump_open = JEMALLOC_N(prof_dump_open_impl); +#endif + static bool -prof_flush(bool propagate_err) +prof_dump_flush(bool propagate_err) { bool ret = false; ssize_t err; @@ -575,7 +714,20 @@ prof_flush(bool propagate_err) } static bool -prof_write(bool propagate_err, const char *s) +prof_dump_close(bool propagate_err) +{ + bool ret; + + assert(prof_dump_fd != -1); + ret = prof_dump_flush(propagate_err); + close(prof_dump_fd); + prof_dump_fd = -1; + + return (ret); +} + +static bool +prof_dump_write(bool propagate_err, const char *s) { unsigned i, slen, n; @@ -586,7 +738,7 @@ prof_write(bool propagate_err, const char *s) while (i < slen) { /* Flush the buffer if it is full. */ if (prof_dump_buf_end == PROF_DUMP_BUFSIZE) - if (prof_flush(propagate_err) && propagate_err) + if (prof_dump_flush(propagate_err) && propagate_err) return (true); if (prof_dump_buf_end + slen <= PROF_DUMP_BUFSIZE) { @@ -606,7 +758,7 @@ prof_write(bool propagate_err, const char *s) JEMALLOC_ATTR(format(printf, 2, 3)) static bool -prof_printf(bool propagate_err, const char *format, ...) +prof_dump_printf(bool propagate_err, const char *format, ...) { bool ret; va_list ap; @@ -615,13 +767,14 @@ prof_printf(bool propagate_err, const char *format, ...) va_start(ap, format); malloc_vsnprintf(buf, sizeof(buf), format, ap); va_end(ap); - ret = prof_write(propagate_err, buf); + ret = prof_dump_write(propagate_err, buf); return (ret); } static void -prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) +prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx, + prof_ctx_list_t *ctx_ql) { prof_thr_cnt_t *thr_cnt; prof_cnt_t tcnt; @@ -630,6 +783,14 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) malloc_mutex_lock(ctx->lock); + /* + * Increment nlimbo so that ctx won't go away before dump. + * Additionally, link ctx into the dump list so that it is included in + * prof_dump()'s second pass. + */ + ctx->nlimbo++; + ql_tail_insert(ctx_ql, ctx, dump_link); + memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t)); ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) { volatile unsigned *epoch = &thr_cnt->epoch; @@ -670,89 +831,52 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx) malloc_mutex_unlock(ctx->lock); } -static void -prof_ctx_destroy(prof_ctx_t *ctx) +static bool +prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all) { - prof_tdata_t *prof_tdata; - cassert(config_prof); - - /* - * Check that ctx is still unused by any thread cache before destroying - * it. 
prof_lookup() increments ctx->nlimbo in order to avoid a race - * condition with this function, as does prof_ctx_merge() in order to - * avoid a race between the main body of prof_ctx_merge() and entry - * into this function. - */ - prof_tdata = prof_tdata_get(false); - assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX); - prof_enter(prof_tdata); - malloc_mutex_lock(ctx->lock); - if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0 && - ctx->nlimbo == 1) { - assert(ctx->cnt_merged.curbytes == 0); - assert(ctx->cnt_merged.accumobjs == 0); - assert(ctx->cnt_merged.accumbytes == 0); - /* Remove ctx from bt2ctx. */ - if (ckh_remove(&bt2ctx, ctx->bt, NULL, NULL)) - assert(false); - prof_leave(prof_tdata); - /* Destroy ctx. */ - malloc_mutex_unlock(ctx->lock); - bt_destroy(ctx->bt); - idalloc(ctx); + if (opt_lg_prof_sample == 0) { + if (prof_dump_printf(propagate_err, + "heap profile: %"PRId64": %"PRId64 + " [%"PRIu64": %"PRIu64"] @ heapprofile\n", + cnt_all->curobjs, cnt_all->curbytes, + cnt_all->accumobjs, cnt_all->accumbytes)) + return (true); } else { - /* - * Compensate for increment in prof_ctx_merge() or - * prof_lookup(). - */ - ctx->nlimbo--; - malloc_mutex_unlock(ctx->lock); - prof_leave(prof_tdata); + if (prof_dump_printf(propagate_err, + "heap profile: %"PRId64": %"PRId64 + " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n", + cnt_all->curobjs, cnt_all->curbytes, + cnt_all->accumobjs, cnt_all->accumbytes, + ((uint64_t)1U << opt_lg_prof_sample))) + return (true); } + + return (false); } static void -prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt) +prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_list_t *ctx_ql) { - bool destroy; - cassert(config_prof); + ctx->nlimbo--; + ql_remove(ctx_ql, ctx, dump_link); +} + +static void +prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_list_t *ctx_ql) +{ - /* Merge cnt stats and detach from ctx. */ malloc_mutex_lock(ctx->lock); - ctx->cnt_merged.curobjs += cnt->cnts.curobjs; - ctx->cnt_merged.curbytes += cnt->cnts.curbytes; - ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs; - ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes; - ql_remove(&ctx->cnts_ql, cnt, cnts_link); - if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL && - ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) { - /* - * Increment ctx->nlimbo in order to keep another thread from - * winning the race to destroy ctx while this one has ctx->lock - * dropped. Without this, it would be possible for another - * thread to: - * - * 1) Sample an allocation associated with ctx. - * 2) Deallocate the sampled object. - * 3) Successfully prof_ctx_destroy(ctx). - * - * The result would be that ctx no longer exists by the time - * this thread accesses it in prof_ctx_destroy(). - */ - ctx->nlimbo++; - destroy = true; - } else - destroy = false; + prof_dump_ctx_cleanup_locked(ctx, ctx_ql); malloc_mutex_unlock(ctx->lock); - if (destroy) - prof_ctx_destroy(ctx); } static bool -prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, prof_bt_t *bt) +prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt, + prof_ctx_list_t *ctx_ql) { + bool ret; unsigned i; cassert(config_prof); @@ -764,36 +888,49 @@ prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, prof_bt_t *bt) * filled in. Avoid dumping any ctx that is an artifact of either * implementation detail. 
*/ + malloc_mutex_lock(ctx->lock); if ((opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) || (opt_prof_accum && ctx->cnt_summed.accumobjs == 0)) { assert(ctx->cnt_summed.curobjs == 0); assert(ctx->cnt_summed.curbytes == 0); assert(ctx->cnt_summed.accumobjs == 0); assert(ctx->cnt_summed.accumbytes == 0); - return (false); + ret = false; + goto label_return; } - if (prof_printf(propagate_err, "%"PRId64": %"PRId64 + if (prof_dump_printf(propagate_err, "%"PRId64": %"PRId64 " [%"PRIu64": %"PRIu64"] @", ctx->cnt_summed.curobjs, ctx->cnt_summed.curbytes, - ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes)) - return (true); + ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes)) { + ret = true; + goto label_return; + } for (i = 0; i < bt->len; i++) { - if (prof_printf(propagate_err, " %#"PRIxPTR, - (uintptr_t)bt->vec[i])) - return (true); + if (prof_dump_printf(propagate_err, " %#"PRIxPTR, + (uintptr_t)bt->vec[i])) { + ret = true; + goto label_return; + } } - if (prof_write(propagate_err, "\n")) - return (true); + if (prof_dump_write(propagate_err, "\n")) { + ret = true; + goto label_return; + } - return (false); + ret = false; +label_return: + prof_dump_ctx_cleanup_locked(ctx, ctx_ql); + malloc_mutex_unlock(ctx->lock); + return (ret); } static bool prof_dump_maps(bool propagate_err) { + bool ret; int mfd; char filename[PATH_MAX + 1]; @@ -805,25 +942,52 @@ prof_dump_maps(bool propagate_err) if (mfd != -1) { ssize_t nread; - if (prof_write(propagate_err, "\nMAPPED_LIBRARIES:\n") && - propagate_err) - return (true); + if (prof_dump_write(propagate_err, "\nMAPPED_LIBRARIES:\n") && + propagate_err) { + ret = true; + goto label_return; + } nread = 0; do { prof_dump_buf_end += nread; if (prof_dump_buf_end == PROF_DUMP_BUFSIZE) { /* Make space in prof_dump_buf before read(). */ - if (prof_flush(propagate_err) && propagate_err) - return (true); + if (prof_dump_flush(propagate_err) && + propagate_err) { + ret = true; + goto label_return; + } } nread = read(mfd, &prof_dump_buf[prof_dump_buf_end], PROF_DUMP_BUFSIZE - prof_dump_buf_end); } while (nread > 0); + } else { + ret = true; + goto label_return; + } + + ret = false; +label_return: + if (mfd != -1) close(mfd); - } else - return (true); + return (ret); +} - return (false); +static void +prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_nctx, + const char *filename) +{ + + if (cnt_all->curbytes != 0) { + malloc_printf("<jemalloc>: Leak summary: %"PRId64" byte%s, %" + PRId64" object%s, %zu context%s\n", + cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "", + cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "", + leak_nctx, (leak_nctx != 1) ? "s" : ""); + malloc_printf( + "<jemalloc>: Run pprof on \"%s\" for leak detail\n", + filename); + } } static bool @@ -833,98 +997,74 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck) prof_cnt_t cnt_all; size_t tabind; union { - prof_bt_t *p; - void *v; - } bt; - union { prof_ctx_t *p; void *v; } ctx; size_t leak_nctx; + prof_ctx_list_t ctx_ql; cassert(config_prof); prof_tdata = prof_tdata_get(false); if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX) return (true); - prof_enter(prof_tdata); - prof_dump_fd = creat(filename, 0644); - if (prof_dump_fd == -1) { - if (propagate_err == false) { - malloc_printf( - "<jemalloc>: creat(\"%s\"), 0644) failed\n", - filename); - if (opt_abort) - abort(); - } - goto label_error; - } + + malloc_mutex_lock(&prof_dump_mtx); /* Merge per thread profile stats, and sum them in cnt_all. 
*/ memset(&cnt_all, 0, sizeof(prof_cnt_t)); leak_nctx = 0; + ql_new(&ctx_ql); + prof_enter(prof_tdata); for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;) - prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx); + prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctx_ql); + prof_leave(prof_tdata); + + /* Create dump file. */ + if ((prof_dump_fd = prof_dump_open(propagate_err, filename)) == -1) + goto label_open_close_error; /* Dump profile header. */ - if (opt_lg_prof_sample == 0) { - if (prof_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heapprofile\n", - cnt_all.curobjs, cnt_all.curbytes, - cnt_all.accumobjs, cnt_all.accumbytes)) - goto label_error; - } else { - if (prof_printf(propagate_err, - "heap profile: %"PRId64": %"PRId64 - " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n", - cnt_all.curobjs, cnt_all.curbytes, - cnt_all.accumobjs, cnt_all.accumbytes, - ((uint64_t)1U << opt_lg_prof_sample))) - goto label_error; - } + if (prof_dump_header(propagate_err, &cnt_all)) + goto label_write_error; - /* Dump per ctx profile stats. */ - for (tabind = 0; ckh_iter(&bt2ctx, &tabind, &bt.v, &ctx.v) - == false;) { - if (prof_dump_ctx(propagate_err, ctx.p, bt.p)) - goto label_error; + /* Dump per ctx profile stats. */ + while ((ctx.p = ql_first(&ctx_ql)) != NULL) { + if (prof_dump_ctx(propagate_err, ctx.p, ctx.p->bt, &ctx_ql)) + goto label_write_error; } /* Dump /proc/<pid>/maps if possible. */ if (prof_dump_maps(propagate_err)) - goto label_error; + goto label_write_error; - if (prof_flush(propagate_err)) - goto label_error; - close(prof_dump_fd); - prof_leave(prof_tdata); + if (prof_dump_close(propagate_err)) + goto label_open_close_error; - if (leakcheck && cnt_all.curbytes != 0) { - malloc_printf("<jemalloc>: Leak summary: %"PRId64" byte%s, %" - PRId64" object%s, %zu context%s\n", - cnt_all.curbytes, (cnt_all.curbytes != 1) ? "s" : "", - cnt_all.curobjs, (cnt_all.curobjs != 1) ? "s" : "", - leak_nctx, (leak_nctx != 1) ? 
"s" : ""); - malloc_printf( - "<jemalloc>: Run pprof on \"%s\" for leak detail\n", - filename); - } + malloc_mutex_unlock(&prof_dump_mtx); + + if (leakcheck) + prof_leakcheck(&cnt_all, leak_nctx, filename); return (false); -label_error: - prof_leave(prof_tdata); +label_write_error: + prof_dump_close(propagate_err); +label_open_close_error: + while ((ctx.p = ql_first(&ctx_ql)) != NULL) + prof_dump_ctx_cleanup(ctx.p, &ctx_ql); + malloc_mutex_unlock(&prof_dump_mtx); return (true); } #define DUMP_FILENAME_BUFSIZE (PATH_MAX + 1) +#define VSEQ_INVALID UINT64_C(0xffffffffffffffff) static void prof_dump_filename(char *filename, char v, int64_t vseq) { cassert(config_prof); - if (vseq != UINT64_C(0xffffffffffffffff)) { + if (vseq != VSEQ_INVALID) { /* "<prefix>.<pid>.<seq>.v<vseq>.heap" */ malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE, "%s.%d.%"PRIu64".%c%"PRId64".heap", @@ -950,7 +1090,7 @@ prof_fdump(void) if (opt_prof_final && opt_prof_prefix[0] != '\0') { malloc_mutex_lock(&prof_dump_seq_mtx); - prof_dump_filename(filename, 'f', UINT64_C(0xffffffffffffffff)); + prof_dump_filename(filename, 'f', VSEQ_INVALID); malloc_mutex_unlock(&prof_dump_seq_mtx); prof_dump(false, filename, opt_prof_leak); } @@ -1056,14 +1196,6 @@ prof_bt_keycomp(const void *k1, const void *k2) return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0); } -static malloc_mutex_t * -prof_ctx_mutex_choose(void) -{ - unsigned nctxs = atomic_add_u(&cum_ctxs, 1); - - return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]); -} - prof_tdata_t * prof_tdata_init(void) { @@ -1208,6 +1340,8 @@ prof_boot2(void) if (malloc_mutex_init(&prof_dump_seq_mtx)) return (true); + if (malloc_mutex_init(&prof_dump_mtx)) + return (true); if (atexit(prof_fdump) != 0) { malloc_write("<jemalloc>: Error in atexit()\n"); @@ -1245,10 +1379,10 @@ prof_prefork(void) if (opt_prof) { unsigned i; - malloc_mutex_lock(&bt2ctx_mtx); - malloc_mutex_lock(&prof_dump_seq_mtx); + malloc_mutex_prefork(&bt2ctx_mtx); + malloc_mutex_prefork(&prof_dump_seq_mtx); for (i = 0; i < PROF_NCTX_LOCKS; i++) - malloc_mutex_lock(&ctx_locks[i]); + malloc_mutex_prefork(&ctx_locks[i]); } } diff --git a/src/quarantine.c b/src/quarantine.c index f96a948d..54315116 100644 --- a/src/quarantine.c +++ b/src/quarantine.c @@ -141,8 +141,17 @@ quarantine(void *ptr) obj->usize = usize; quarantine->curbytes += usize; quarantine->curobjs++; - if (opt_junk) - memset(ptr, 0x5a, usize); + if (config_fill && opt_junk) { + /* + * Only do redzone validation if Valgrind isn't in + * operation. 
+ */ + if ((config_valgrind == false || opt_valgrind == false) + && usize <= SMALL_MAXCLASS) + arena_quarantine_junk_small(ptr, usize); + else + memset(ptr, 0x5a, usize); + } } else { assert(quarantine->curbytes == 0); idalloc(ptr); diff --git a/src/rtree.c b/src/rtree.c index 90c6935a..205957ac 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -2,42 +2,55 @@ #include "jemalloc/internal/jemalloc_internal.h" rtree_t * -rtree_new(unsigned bits) +rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc) { rtree_t *ret; - unsigned bits_per_level, height, i; + unsigned bits_per_level, bits_in_leaf, height, i; + + assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3)); bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1; - height = bits / bits_per_level; - if (height * bits_per_level != bits) - height++; - assert(height * bits_per_level >= bits); + bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1; + if (bits > bits_in_leaf) { + height = 1 + (bits - bits_in_leaf) / bits_per_level; + if ((height-1) * bits_per_level + bits_in_leaf != bits) + height++; + } else { + height = 1; + } + assert((height-1) * bits_per_level + bits_in_leaf >= bits); - ret = (rtree_t*)base_alloc(offsetof(rtree_t, level2bits) + + ret = (rtree_t*)alloc(offsetof(rtree_t, level2bits) + (sizeof(unsigned) * height)); if (ret == NULL) return (NULL); memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) * height)); + ret->alloc = alloc; + ret->dalloc = dalloc; if (malloc_mutex_init(&ret->mutex)) { - /* Leak the rtree. */ + if (dalloc != NULL) + dalloc(ret); return (NULL); } ret->height = height; - if (bits_per_level * height > bits) - ret->level2bits[0] = bits % bits_per_level; - else - ret->level2bits[0] = bits_per_level; - for (i = 1; i < height; i++) - ret->level2bits[i] = bits_per_level; - - ret->root = (void**)base_alloc(sizeof(void *) << ret->level2bits[0]); + if (height > 1) { + if ((height-1) * bits_per_level + bits_in_leaf > bits) { + ret->level2bits[0] = (bits - bits_in_leaf) % + bits_per_level; + } else + ret->level2bits[0] = bits_per_level; + for (i = 1; i < height-1; i++) + ret->level2bits[i] = bits_per_level; + ret->level2bits[height-1] = bits_in_leaf; + } else + ret->level2bits[0] = bits; + + ret->root = (void**)alloc(sizeof(void *) << ret->level2bits[0]); if (ret->root == NULL) { - /* - * We leak the rtree here, since there's no generic base - * deallocation. - */ + if (dalloc != NULL) + dalloc(ret); return (NULL); } memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]); @@ -45,6 +58,31 @@ rtree_new(unsigned bits) return (ret); } +static void +rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level) +{ + + if (level < rtree->height - 1) { + size_t nchildren, i; + + nchildren = ZU(1) << rtree->level2bits[level]; + for (i = 0; i < nchildren; i++) { + void **child = (void **)node[i]; + if (child != NULL) + rtree_delete_subtree(rtree, child, level + 1); + } + } + rtree->dalloc(node); +} + +void +rtree_delete(rtree_t *rtree) +{ + + rtree_delete_subtree(rtree, rtree->root, 0); + rtree->dalloc(rtree); +} + void rtree_prefork(rtree_t *rtree) { diff --git a/src/stats.c b/src/stats.c index 43f87af6..bef2ab33 100644 --- a/src/stats.c +++ b/src/stats.c @@ -345,25 +345,25 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, malloc_cprintf(write_cb, cbopaque, "Assertions %s\n", bv ? 
"enabled" : "disabled"); -#define OPT_WRITE_BOOL(n) \ +#define OPT_WRITE_BOOL(n) \ if ((err = je_mallctl("opt."#n, &bv, &bsz, NULL, 0)) \ == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %s\n", bv ? "true" : "false"); \ } -#define OPT_WRITE_SIZE_T(n) \ +#define OPT_WRITE_SIZE_T(n) \ if ((err = je_mallctl("opt."#n, &sv, &ssz, NULL, 0)) \ == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zu\n", sv); \ } -#define OPT_WRITE_SSIZE_T(n) \ +#define OPT_WRITE_SSIZE_T(n) \ if ((err = je_mallctl("opt."#n, &ssv, &sssz, NULL, 0)) \ == 0) { \ malloc_cprintf(write_cb, cbopaque, \ " opt."#n": %zd\n", ssv); \ } -#define OPT_WRITE_CHAR_P(n) \ +#define OPT_WRITE_CHAR_P(n) \ if ((err = je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0)) \ == 0) { \ malloc_cprintf(write_cb, cbopaque, \ diff --git a/src/tcache.c b/src/tcache.c index 98ed19ed..6de92960 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -260,8 +260,8 @@ tcache_arena_dissociate(tcache_t *tcache) /* Unlink from list of extant tcaches. */ malloc_mutex_lock(&tcache->arena->lock); ql_remove(&tcache->arena->tcache_ql, tcache, link); - malloc_mutex_unlock(&tcache->arena->lock); tcache_stats_merge(tcache, tcache->arena); + malloc_mutex_unlock(&tcache->arena->lock); } } @@ -292,7 +292,7 @@ tcache_create(arena_t *arena) else if (size <= tcache_maxclass) tcache = (tcache_t *)arena_malloc_large(arena, size, true); else - tcache = (tcache_t *)icallocx(size, false, arena); + tcache = (tcache_t *)icalloct(size, false, arena); if (tcache == NULL) return (NULL); @@ -366,7 +366,7 @@ tcache_destroy(tcache_t *tcache) arena_dalloc_large(arena, chunk, tcache); } else - idallocx(tcache, false); + idalloct(tcache, false); } void @@ -399,11 +399,14 @@ tcache_thread_cleanup(void *arg) } } +/* Caller must own arena->lock. */ void tcache_stats_merge(tcache_t *tcache, arena_t *arena) { unsigned i; + cassert(config_stats); + /* Merge and reset tcache stats. */ for (i = 0; i < NBINS; i++) { arena_bin_t *bin = &arena->bins[i]; @@ -21,7 +21,7 @@ void malloc_tsd_dalloc(void *wrapper) { - idalloc(wrapper); + idalloct(wrapper, false); } void @@ -105,3 +105,37 @@ JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used) static const BOOL (WINAPI *tls_callback)(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) = _tls_callback; #endif + +#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \ + !defined(_WIN32)) +void * +tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block) +{ + pthread_t self = pthread_self(); + tsd_init_block_t *iter; + + /* Check whether this thread has already inserted into the list. */ + malloc_mutex_lock(&head->lock); + ql_foreach(iter, &head->blocks, link) { + if (iter->thread == self) { + malloc_mutex_unlock(&head->lock); + return (iter->data); + } + } + /* Insert block into list. */ + ql_elm_new(block, link); + block->thread = self; + ql_tail_insert(&head->blocks, block, link); + malloc_mutex_unlock(&head->lock); + return (NULL); +} + +void +tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) +{ + + malloc_mutex_lock(&head->lock); + ql_remove(&head->blocks, block, link); + malloc_mutex_unlock(&head->lock); +} +#endif @@ -77,7 +77,7 @@ malloc_write(const char *s) * provide a wrapper. 
*/ int -buferror(char *buf, size_t buflen) +buferror(int err, char *buf, size_t buflen) { #ifdef _WIN32 @@ -85,34 +85,36 @@ buferror(char *buf, size_t buflen) (LPSTR)buf, buflen, NULL); return (0); #elif defined(_GNU_SOURCE) - char *b = strerror_r(errno, buf, buflen); + char *b = strerror_r(err, buf, buflen); if (b != buf) { strncpy(buf, b, buflen); buf[buflen-1] = '\0'; } return (0); #else - return (strerror_r(errno, buf, buflen)); + return (strerror_r(err, buf, buflen)); #endif } uintmax_t -malloc_strtoumax(const char *nptr, char **endptr, int base) +malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base) { uintmax_t ret, digit; int b; bool neg; const char *p, *ns; + p = nptr; if (base < 0 || base == 1 || base > 36) { + ns = p; set_errno(EINVAL); - return (UINTMAX_MAX); + ret = UINTMAX_MAX; + goto label_return; } b = base; /* Swallow leading whitespace and get sign, if any. */ neg = false; - p = nptr; while (true) { switch (*p) { case '\t': case '\n': case '\v': case '\f': case '\r': case ' ': @@ -146,7 +148,7 @@ malloc_strtoumax(const char *nptr, char **endptr, int base) if (b == 8) p++; break; - case 'x': + case 'X': case 'x': switch (p[2]) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -164,7 +166,9 @@ malloc_strtoumax(const char *nptr, char **endptr, int base) } break; default: - break; + p++; + ret = 0; + goto label_return; } } if (b == 0) @@ -181,13 +185,22 @@ malloc_strtoumax(const char *nptr, char **endptr, int base) if (ret < pret) { /* Overflow. */ set_errno(ERANGE); - return (UINTMAX_MAX); + ret = UINTMAX_MAX; + goto label_return; } p++; } if (neg) ret = -ret; + if (p == ns) { + /* No conversion performed. */ + set_errno(EINVAL); + ret = UINTMAX_MAX; + goto label_return; + } + +label_return: if (endptr != NULL) { if (p == ns) { /* No characters were converted. */ @@ -195,7 +208,6 @@ malloc_strtoumax(const char *nptr, char **endptr, int base) } else *endptr = (char *)p; } - return (ret); } @@ -331,7 +343,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) APPEND_C(' '); \ } \ } while (0) -#define GET_ARG_NUMERIC(val, len) do { \ +#define GET_ARG_NUMERIC(val, len) do { \ switch (len) { \ case '?': \ val = va_arg(ap, int); \ @@ -354,6 +366,9 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) case 'j': \ val = va_arg(ap, intmax_t); \ break; \ + case 'j' | 0x80: \ + val = va_arg(ap, uintmax_t); \ + break; \ case 't': \ val = va_arg(ap, ptrdiff_t); \ break; \ @@ -385,11 +400,6 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) unsigned char len = '?'; f++; - if (*f == '%') { - /* %% */ - APPEND_C(*f); - break; - } /* Flags. */ while (true) { switch (*f) { @@ -419,6 +429,10 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) case '*': width = va_arg(ap, int); f++; + if (width < 0) { + left_justify = true; + width = -width; + } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { @@ -428,19 +442,16 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) assert(uwidth != UINTMAX_MAX || get_errno() != ERANGE); width = (int)uwidth; - if (*f == '.') { - f++; - goto label_precision; - } else - goto label_length; break; - } case '.': - f++; - goto label_precision; - default: goto label_length; + } default: + break; } + /* Width/precision separator. */ + if (*f == '.') + f++; + else + goto label_length; /* Precision. 
*/ - label_precision: switch (*f) { case '*': prec = va_arg(ap, int); @@ -469,16 +480,8 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) } else len = 'l'; break; - case 'j': - len = 'j'; - f++; - break; - case 't': - len = 't'; - f++; - break; - case 'z': - len = 'z'; + case 'q': case 'j': case 't': case 'z': + len = *f; f++; break; default: break; @@ -487,6 +490,11 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) switch (*f) { char *s; size_t slen; + case '%': + /* %% */ + APPEND_C(*f); + f++; + break; case 'd': case 'i': { intmax_t val JEMALLOC_CC_SILENCE_INIT(0); char buf[D2S_BUFSIZE]; @@ -540,7 +548,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) assert(len == '?' || len == 'l'); assert_not_implemented(len != 'l'); s = va_arg(ap, char *); - slen = (prec == -1) ? strlen(s) : prec; + slen = (prec < 0) ? strlen(s) : prec; APPEND_PADDED_S(s, slen, width, left_justify); f++; break; @@ -553,8 +561,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap) APPEND_PADDED_S(s, slen, width, left_justify); f++; break; - } - default: not_implemented(); + } default: not_reached(); } break; } default: { @@ -137,7 +137,7 @@ zone_destroy(malloc_zone_t *zone) { /* This function should never be called. */ - assert(false); + not_reached(); return (NULL); } |
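
Editor's note on the prof.c hunk above: when JEMALLOC_JET is defined, prof_dump_open() is compiled under the name prof_dump_open_impl and the public symbol becomes a writable function pointer initialized to that implementation, so tests can substitute their own dump-open routine. The following is a minimal, self-contained sketch of that macro/function-pointer indirection; TEST_HOOKS, dump_open, dump_open_impl and test_dump_open are hypothetical stand-ins for JEMALLOC_JET, JEMALLOC_N() and the prof_dump_open symbols in the diff, not jemalloc code.

/*
 * Editor's sketch (not part of the commit): test-hook indirection in the
 * style of the JEMALLOC_JET prof_dump_open change, with hypothetical names.
 */
#include <stdio.h>

#define TEST_HOOKS 1		/* stands in for JEMALLOC_JET */

#ifdef TEST_HOOKS
#define	dump_open dump_open_impl
#endif
static int
dump_open(const char *filename)
{
	/* Real implementation; the jemalloc version calls creat(). */
	printf("opening %s\n", filename);
	return (0);
}
#ifdef TEST_HOOKS
#undef dump_open
typedef int (dump_open_t)(const char *);
/* Tests may overwrite this pointer to intercept dump file creation. */
dump_open_t *dump_open = dump_open_impl;
#endif

static int
test_dump_open(const char *filename)
{
	printf("intercepted open of %s\n", filename);
	return (-1);
}

int
main(void)
{
	dump_open("default.heap");	/* calls the real implementation */
#ifdef TEST_HOOKS
	dump_open = test_dump_open;	/* swap in a test double */
#endif
	dump_open("intercepted.heap");
	return (0);
}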
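
Editor's note on the rtree.c hunk above: the new rtree_new() gives the leaf level its own width (bits_in_leaf, sized for uint8_t entries) while interior levels keep bits_per_level, and derives the tree height from the two. The sketch below lifts that height/level2bits arithmetic out of rtree_new() so it can be exercised directly; the widths are passed in as parameters rather than derived from RTREE_NODESIZE, and the example numbers (32-bit keys, 11-bit interior levels, 13-bit leaf) are illustrative only.

/*
 * Editor's sketch (not part of the commit): the height/level2bits
 * computation from the new rtree_new(), with the per-level widths
 * supplied by the caller instead of derived from RTREE_NODESIZE.
 */
#include <assert.h>
#include <stdio.h>

static unsigned
rtree_height(unsigned bits, unsigned bits_per_level, unsigned bits_in_leaf)
{
	unsigned height;

	if (bits > bits_in_leaf) {
		height = 1 + (bits - bits_in_leaf) / bits_per_level;
		if ((height-1) * bits_per_level + bits_in_leaf != bits)
			height++;
	} else
		height = 1;
	assert((height-1) * bits_per_level + bits_in_leaf >= bits);
	return (height);
}

static void
rtree_level2bits(unsigned bits, unsigned bits_per_level, unsigned bits_in_leaf,
    unsigned height, unsigned *level2bits)
{
	unsigned i;

	if (height > 1) {
		if ((height-1) * bits_per_level + bits_in_leaf > bits) {
			/* Top level absorbs the remainder. */
			level2bits[0] = (bits - bits_in_leaf) %
			    bits_per_level;
		} else
			level2bits[0] = bits_per_level;
		for (i = 1; i < height-1; i++)
			level2bits[i] = bits_per_level;
		level2bits[height-1] = bits_in_leaf;
	} else
		level2bits[0] = bits;
}

int
main(void)
{
	/* Illustrative parameters: 32 key bits, 11 per interior level, 13 in the leaf. */
	unsigned bits = 32, bpl = 11, bil = 13;
	unsigned l2b[8];
	unsigned height = rtree_height(bits, bpl, bil);
	unsigned i;

	rtree_level2bits(bits, bpl, bil, height, l2b);
	printf("height = %u; level widths:", height);
	for (i = 0; i < height; i++)
		printf(" %u", l2b[i]);
	printf("\n");
	return (0);
}

With these example parameters the result is a height of 3, with the top level taking the 8 remainder bits, the interior level 11 bits, and the leaf 13 bits, summing to the 32 key bits.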
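
Editor's note on the tsd.c hunk above: tsd_init_check_recursion()/tsd_init_finish() let a thread that re-enters TSD initialization (for example via the pthread TSD machinery calling back into malloc) find its own in-progress block instead of recursing. The sketch below rebuilds that protocol with a plain singly linked list and pthread primitives in place of jemalloc's ql macros and malloc_mutex; all init_* names are hypothetical and the "wrapper" payload is a placeholder.

/*
 * Editor's sketch (not part of the commit): the per-thread
 * recursion-detection protocol behind tsd_init_check_recursion() and
 * tsd_init_finish(), reimplemented with standard pthread primitives.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

typedef struct init_block_s init_block_t;
struct init_block_s {
	init_block_t	*next;
	pthread_t	thread;
	void		*data;
};

typedef struct {
	pthread_mutex_t	lock;
	init_block_t	*blocks;
} init_head_t;

static void *
init_check_recursion(init_head_t *head, init_block_t *block)
{
	pthread_t self = pthread_self();
	init_block_t *iter;

	/* If this thread already registered a block, hand back its data. */
	pthread_mutex_lock(&head->lock);
	for (iter = head->blocks; iter != NULL; iter = iter->next) {
		if (pthread_equal(iter->thread, self)) {
			pthread_mutex_unlock(&head->lock);
			return (iter->data);
		}
	}
	/* First entry for this thread: register the caller's block. */
	block->thread = self;
	block->data = NULL;
	block->next = head->blocks;
	head->blocks = block;
	pthread_mutex_unlock(&head->lock);
	return (NULL);
}

static void
init_finish(init_head_t *head, init_block_t *block)
{
	init_block_t **iterp;

	/* Initialization is complete; unlink this thread's block. */
	pthread_mutex_lock(&head->lock);
	for (iterp = &head->blocks; *iterp != NULL; iterp = &(*iterp)->next) {
		if (*iterp == block) {
			*iterp = block->next;
			break;
		}
	}
	pthread_mutex_unlock(&head->lock);
}

static init_head_t head = { PTHREAD_MUTEX_INITIALIZER, NULL };

int
main(void)
{
	init_block_t block;
	void *data;

	if ((data = init_check_recursion(&head, &block)) == NULL) {
		block.data = "wrapper";	/* initialization in progress */
		/* A re-entrant call from this thread now sees block.data. */
		printf("recursive lookup: %s\n",
		    (char *)init_check_recursion(&head, &block));
		init_finish(&head, &block);
	}
	return (0);
}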