Diffstat (limited to 'deps/jemalloc/include/jemalloc/internal/tsd.h')
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/tsd.h  243
 1 file changed, 173 insertions, 70 deletions
diff --git a/deps/jemalloc/include/jemalloc/internal/tsd.h b/deps/jemalloc/include/jemalloc/internal/tsd.h
index 9ba260045..66d688227 100644
--- a/deps/jemalloc/include/jemalloc/internal/tsd.h
+++ b/deps/jemalloc/include/jemalloc/internal/tsd.h
@@ -1,10 +1,12 @@
#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H
+#include "jemalloc/internal/activity_callback.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
+#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
@@ -15,39 +17,30 @@
/*
* Thread-Specific-Data layout
- * --- data accessed on tcache fast path: state, rtree_ctx, stats, prof ---
- * s: state
- * e: tcache_enabled
- * m: thread_allocated (config_stats)
- * f: thread_deallocated (config_stats)
- * p: prof_tdata (config_prof)
- * c: rtree_ctx (rtree cache accessed on deallocation)
- * t: tcache
- * --- data not accessed on tcache fast path: arena-related fields ---
- * d: arenas_tdata_bypass
- * r: reentrancy_level
- * x: narenas_tdata
- * i: iarena
- * a: arena
- * o: arenas_tdata
- * Loading TSD data is on the critical path of basically all malloc operations.
- * In particular, tcache and rtree_ctx rely on hot CPU cache to be effective.
- * Use a compact layout to reduce cache footprint.
- * +--- 64-bit and 64B cacheline; 1B each letter; First byte on the left. ---+
- * |---------------------------- 1st cacheline ----------------------------|
- * | sedrxxxx mmmmmmmm ffffffff pppppppp [c * 32 ........ ........ .......] |
- * |---------------------------- 2nd cacheline ----------------------------|
- * | [c * 64 ........ ........ ........ ........ ........ ........ .......] |
- * |---------------------------- 3nd cacheline ----------------------------|
- * | [c * 32 ........ ........ .......] iiiiiiii aaaaaaaa oooooooo [t...... |
- * +-------------------------------------------------------------------------+
- * Note: the entire tcache is embedded into TSD and spans multiple cachelines.
*
- * The last 3 members (i, a and o) before tcache isn't really needed on tcache
- * fast path. However we have a number of unused tcache bins and witnesses
- * (never touched unless config_debug) at the end of tcache, so we place them
- * there to avoid breaking the cachelines and possibly paging in an extra page.
+ * At least some thread-local data gets touched on the fast-path of almost all
+ * malloc operations. But much of it is only necessary down slow-paths, or for
+ * testing. We want to colocate the fast-path data so that it can live on the
+ * same cacheline if possible. So we define three tiers of hotness:
+ * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
+ * TSD_DATA_SLOW: Touched down slow paths. "Slow" here is sort of general;
+ * there are "semi-slow" paths like "not a sized deallocation, but can still
+ * live in the tcache". We'll want to keep these closer to the fast-path
+ * data.
+ * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
+ *
+ * An additional concern is that the larger tcache bins won't be used (we have a
+ * bin per size class, but by default only cache relatively small objects). So
+ * the earlier bins are in the TSD_DATA_FAST tier, but the later ones are in the
+ * TSD_DATA_SLOWER tier.
+ *
+ * As a result of all this, we put the slow data first, then the fast data, then
+ * the slower data, while keeping the tcache as the last element of the fast
+ * data (so that the fast -> slower transition happens midway through the
+ * tcache). While we don't yet play alignment tricks to guarantee it, this
+ * increases our odds of getting some cache/page locality on fast paths.
*/
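
To make the intended ordering concrete, here is a stand-alone, heavily abridged sketch (hypothetical struct and field names, not jemalloc code; the real fields are generated from the TSD_DATA_* lists below and mangled via TSD_MANGLE): slow-tier data first, then state, then the fast-tier data, so that state and the fast-path fields can plausibly share a 64-byte cache line.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
	uint64_t slow_tier_field;	/* stands in for e.g. prng_state */
	uint8_t  state;			/* tsd_state_t is a single byte */
	uint64_t fast_tier_allocated;	/* stands in for thread_allocated */
	uint64_t fast_tier_deallocated;	/* stands in for thread_deallocated */
	/* ... the tcache would follow, spilling into the slower tier ... */
} layout_sketch_t;

int main(void) {
	/* The fast-path fields begin within one cache line of state. */
	assert(offsetof(layout_sketch_t, fast_tier_deallocated)
	    - offsetof(layout_sketch_t, state) < 64);
	return 0;
}
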
+
#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
@@ -60,50 +53,112 @@ typedef void (*test_callback_t)(int *);
# define MALLOC_TEST_TSD_INITIALIZER
#endif
-/* O(name, type, nullable type */
-#define MALLOC_TSD \
+typedef ql_elm(tsd_t) tsd_link_t;
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_SLOW \
O(tcache_enabled, bool, bool) \
- O(arenas_tdata_bypass, bool, bool) \
O(reentrancy_level, int8_t, int8_t) \
- O(narenas_tdata, uint32_t, uint32_t) \
- O(offset_state, uint64_t, uint64_t) \
- O(thread_allocated, uint64_t, uint64_t) \
- O(thread_deallocated, uint64_t, uint64_t) \
- O(bytes_until_sample, int64_t, int64_t) \
+ O(thread_allocated_last_event, uint64_t, uint64_t) \
+ O(thread_allocated_next_event, uint64_t, uint64_t) \
+ O(thread_deallocated_last_event, uint64_t, uint64_t) \
+ O(thread_deallocated_next_event, uint64_t, uint64_t) \
+ O(tcache_gc_event_wait, uint64_t, uint64_t) \
+ O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \
+ O(prof_sample_event_wait, uint64_t, uint64_t) \
+ O(prof_sample_last_event, uint64_t, uint64_t) \
+ O(stats_interval_event_wait, uint64_t, uint64_t) \
+ O(stats_interval_last_event, uint64_t, uint64_t) \
+ O(peak_alloc_event_wait, uint64_t, uint64_t) \
+ O(peak_dalloc_event_wait, uint64_t, uint64_t) \
O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
- O(rtree_ctx, rtree_ctx_t, rtree_ctx_t) \
+ O(prng_state, uint64_t, uint64_t) \
+ O(san_extents_until_guard_small, uint64_t, uint64_t) \
+ O(san_extents_until_guard_large, uint64_t, uint64_t) \
O(iarena, arena_t *, arena_t *) \
O(arena, arena_t *, arena_t *) \
- O(arenas_tdata, arena_tdata_t *, arena_tdata_t *)\
+ O(arena_decay_ticker, ticker_geom_t, ticker_geom_t) \
+ O(sec_shard, uint8_t, uint8_t) \
O(binshards, tsd_binshards_t, tsd_binshards_t)\
- O(tcache, tcache_t, tcache_t) \
+ O(tsd_link, tsd_link_t, tsd_link_t) \
+ O(in_hook, bool, bool) \
+ O(peak, peak_t, peak_t) \
+ O(activity_callback_thunk, activity_callback_thunk_t, \
+ activity_callback_thunk_t) \
+ O(tcache_slow, tcache_slow_t, tcache_slow_t) \
+ O(rtree_ctx, rtree_ctx_t, rtree_ctx_t)
+
+#define TSD_DATA_SLOW_INITIALIZER \
+ /* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
+ /* reentrancy_level */ 0, \
+ /* thread_allocated_last_event */ 0, \
+ /* thread_allocated_next_event */ 0, \
+ /* thread_deallocated_last_event */ 0, \
+ /* thread_deallocated_next_event */ 0, \
+ /* tcache_gc_event_wait */ 0, \
+ /* tcache_gc_dalloc_event_wait */ 0, \
+ /* prof_sample_event_wait */ 0, \
+ /* prof_sample_last_event */ 0, \
+ /* stats_interval_event_wait */ 0, \
+ /* stats_interval_last_event */ 0, \
+ /* peak_alloc_event_wait */ 0, \
+ /* peak_dalloc_event_wait */ 0, \
+ /* prof_tdata */ NULL, \
+ /* prng_state */ 0, \
+ /* san_extents_until_guard_small */ 0, \
+ /* san_extents_until_guard_large */ 0, \
+ /* iarena */ NULL, \
+ /* arena */ NULL, \
+ /* arena_decay_ticker */ \
+ TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \
+ /* sec_shard */ (uint8_t)-1, \
+ /* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
+ /* tsd_link */ {NULL}, \
+ /* in_hook */ false, \
+ /* peak */ PEAK_INITIALIZER, \
+ /* activity_callback_thunk */ \
+ ACTIVITY_CALLBACK_THUNK_INITIALIZER, \
+ /* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \
+ /* rtree_ctx */ RTREE_CTX_INITIALIZER,
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_FAST \
+ O(thread_allocated, uint64_t, uint64_t) \
+ O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
+ O(thread_deallocated, uint64_t, uint64_t) \
+ O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
+ O(tcache, tcache_t, tcache_t)
+
+#define TSD_DATA_FAST_INITIALIZER \
+ /* thread_allocated */ 0, \
+ /* thread_allocated_next_event_fast */ 0, \
+ /* thread_deallocated */ 0, \
+ /* thread_deallocated_next_event_fast */ 0, \
+ /* tcache */ TCACHE_ZERO_INITIALIZER,
+
+/* O(name, type, nullable type) */
+#define TSD_DATA_SLOWER \
O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
MALLOC_TEST_TSD
+#define TSD_DATA_SLOWER_INITIALIZER \
+ /* witness */ WITNESS_TSD_INITIALIZER \
+ /* test data */ MALLOC_TEST_TSD_INITIALIZER
+
+
#define TSD_INITIALIZER { \
- ATOMIC_INIT(tsd_state_uninitialized), \
- TCACHE_ENABLED_ZERO_INITIALIZER, \
- false, \
- 0, \
- 0, \
- 0, \
- 0, \
- 0, \
- 0, \
- NULL, \
- RTREE_CTX_ZERO_INITIALIZER, \
- NULL, \
- NULL, \
- NULL, \
- TSD_BINSHARDS_ZERO_INITIALIZER, \
- TCACHE_ZERO_INITIALIZER, \
- WITNESS_TSD_INITIALIZER \
- MALLOC_TEST_TSD_INITIALIZER \
+ TSD_DATA_SLOW_INITIALIZER \
+ /* state */ ATOMIC_INIT(tsd_state_uninitialized), \
+ TSD_DATA_FAST_INITIALIZER \
+ TSD_DATA_SLOWER_INITIALIZER \
}
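
As a minimal illustration of how these comma-terminated fragments compose (a hypothetical miniature, not jemalloc code): each TSD_DATA_*_INITIALIZER must list its values in exactly the order in which the matching TSD_DATA_* list declares fields in struct tsd_s below, with the state initializer spliced between the slow and fast runs.

#include <stdint.h>

typedef struct {
	int8_t   reentrancy_level;	/* slow tier */
	uint8_t  state;
	uint64_t thread_allocated;	/* fast tier */
} mini_tsd_t;

#define MINI_SLOW_INITIALIZER	/* reentrancy_level */ 0,
#define MINI_FAST_INITIALIZER	/* thread_allocated */ 0,
#define MINI_INITIALIZER {						\
	MINI_SLOW_INITIALIZER						\
	/* state */ 0,							\
	MINI_FAST_INITIALIZER						\
}

static mini_tsd_t mini_tsd = MINI_INITIALIZER;

int main(void) {
	return (int)mini_tsd.state;
}
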
+#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
+void _malloc_tsd_cleanup_register(bool (*f)(void));
+#endif
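
A usage sketch for the renamed registration hook (the callback below is hypothetical, and it is assumed here that returning true asks the cleanup loop to run the callback again on a later pass):

/* Sketch only; assumes the usual jemalloc internal includes. */
#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
static bool
example_thread_cleanup(void) {
	/* Release per-thread resources owned by this module. */
	return false;	/* Assumed: false means nothing left to clean up. */
}

static void
example_register_cleanup(void) {
	_malloc_tsd_cleanup_register(&example_thread_cleanup);
}
#endif
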
+
void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
-void malloc_tsd_cleanup_register(bool (*f)(void));
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
@@ -189,14 +244,17 @@ struct tsd_s {
* setters below.
*/
+#define O(n, t, nt) \
+ t TSD_MANGLE(n);
+
+ TSD_DATA_SLOW
/*
* We manually limit the state to just a single byte. Unless the 8-bit
* atomics are unavailable (which is rare).
*/
tsd_state_t state;
-#define O(n, t, nt) \
- t TSD_MANGLE(n);
-MALLOC_TSD
+ TSD_DATA_FAST
+ TSD_DATA_SLOWER
#undef O
};
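
The field lists are classic X-macros: O() is defined, each tier list is expanded, and O() is undefined again. A self-contained toy version of the technique (hypothetical names, not jemalloc code) shows the same double expansion used to generate both the fields and, further down, the accessors:

#include <stdint.h>
#include <stdio.h>

/* Toy field list in the same O(name, type, nullable type) shape. */
#define EXAMPLE_FIELDS							\
	O(thread_allocated,	uint64_t,	uint64_t)		\
	O(reentrancy_level,	int8_t,		int8_t)

/* First expansion: declare the struct fields. */
typedef struct {
#define O(n, t, nt)	t n;
EXAMPLE_FIELDS
#undef O
} example_tsd_t;

/* Second expansion: generate a getter per field, as tsd_##n##_get does. */
#define O(n, t, nt)							\
static inline t example_##n##_get(example_tsd_t *e) {			\
	return e->n;							\
}
EXAMPLE_FIELDS
#undef O

int main(void) {
	example_tsd_t e = {0};
	e.thread_allocated = 42;
	printf("%llu\n",
	    (unsigned long long)example_thread_allocated_get(&e));
	return 0;
}
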
@@ -262,7 +320,9 @@ JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
return &tsd->TSD_MANGLE(n); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
@@ -281,7 +341,9 @@ tsd_##n##p_get(tsd_t *tsd) { \
state == tsd_state_minimal_initialized); \
return tsd_##n##p_get_unsafe(tsd); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/*
@@ -297,7 +359,9 @@ tsdn_##n##p_get(tsdn_t *tsdn) { \
tsd_t *tsd = tsdn_tsd(tsdn); \
return (nt *)tsd_##n##p_get(tsd); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
@@ -306,7 +370,9 @@ JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
return *tsd_##n##p_get(tsd); \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
@@ -317,7 +383,9 @@ tsd_##n##_set(tsd_t *tsd, t val) { \
tsd_state_get(tsd) != tsd_state_minimal_initialized); \
*tsd_##n##p_get(tsd) = val; \
}
-MALLOC_TSD
+TSD_DATA_SLOW
+TSD_DATA_FAST
+TSD_DATA_SLOWER
#undef O
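
Concretely, for a field declared as O(arena, arena_t *, arena_t *) in TSD_DATA_SLOW, the expansions above generate tsd_arena_get(), tsd_arena_set(), tsd_arenap_get() and tsdn_arenap_get(). A brief usage sketch (the wrapper name is hypothetical and a nominal tsd is assumed):

static inline void
example_swap_arena(tsd_t *tsd, arena_t *new_arena) {
	arena_t *old = tsd_arena_get(tsd);	/* value getter */
	tsd_arena_set(tsd, new_arena);		/* setter; asserts a nominal state */
	(void)old;
}
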
JEMALLOC_ALWAYS_INLINE void
@@ -382,7 +450,10 @@ tsd_fetch(void) {
static inline bool
tsd_nominal(tsd_t *tsd) {
- return (tsd_state_get(tsd) <= tsd_state_nominal_max);
+ bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
+ assert(nominal || tsd_reentrancy_level_get(tsd) > 0);
+
+ return nominal;
}
JEMALLOC_ALWAYS_INLINE tsdn_t *
@@ -412,4 +483,36 @@ tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
return tsd_rtree_ctx(tsdn_tsd(tsdn));
}
+static inline bool
+tsd_state_nocleanup(tsd_t *tsd) {
+ return tsd_state_get(tsd) == tsd_state_reincarnated ||
+ tsd_state_get(tsd) == tsd_state_minimal_initialized;
+}
+
+/*
+ * These "raw" tsd reentrancy functions don't have any debug checking to make
+ * sure that we're not touching arena 0. Prefer calling pre_reentrancy and
+ * post_reentrancy when possible.
+ */
+static inline void
+tsd_pre_reentrancy_raw(tsd_t *tsd) {
+ bool fast = tsd_fast(tsd);
+ assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
+ ++*tsd_reentrancy_levelp_get(tsd);
+ if (fast) {
+ /* Prepare slow path for reentrancy. */
+ tsd_slow_update(tsd);
+ assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
+ }
+}
+
+static inline void
+tsd_post_reentrancy_raw(tsd_t *tsd) {
+ int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
+ assert(*reentrancy_level > 0);
+ if (--*reentrancy_level == 0) {
+ tsd_slow_update(tsd);
+ }
+}
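
A sketch of the intended pairing (the wrapper name is hypothetical): raise the reentrancy level before calling back into code that may itself allocate, and lower it afterwards so that tsd_slow_update() can restore the fast paths.

static inline void
example_call_hook(tsd_t *tsd, void (*hook)(void *), void *arg) {
	tsd_pre_reentrancy_raw(tsd);
	hook(arg);	/* Any allocation here sees reentrancy_level > 0. */
	tsd_post_reentrancy_raw(tsd);
}
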
+
#endif /* JEMALLOC_INTERNAL_TSD_H */