author     Qi Wang <interwq@gwu.edu>    2022-10-21 15:10:48 -0700
committer  Qi Wang <interwq@gmail.com>  2022-10-25 09:54:38 -0700
commit     143e9c4a2f4eb8916e9802323485fd91260fd17c (patch)
tree       9e7b4e57c92fa8a62be1e3005be9887476ecc9e0
parent     be65438f20a5fe4fdc5c5bb2cfa7ba3f0e9da378 (diff)
Enable fast thread locals for dealloc-only threads.
Previously, if a thread only ever deallocates, it stays on the slow path / minimal initialized state forever. However, dealloc-only is a valid pattern for dedicated reclamation threads -- this means the thread cache is disabled (no batched flush) for them, which causes high overhead and contention. Added a condition to fully initialize TSD once a fair amount of dealloc activity is observed.
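For illustration only (this sketch is not part of the commit, and the names in it are made up), the motivating usage is roughly the following: worker code allocates, and a dedicated reclamation thread whose only allocator calls are free() hands the memory back. Before this change, such a thread never left the minimal initialized TSD state, so its thread cache stayed disabled.

/*
 * Hypothetical sketch of the dealloc-only pattern this change targets.
 * The reclamation thread below never allocates; it only frees.
 */
#include <pthread.h>
#include <stdlib.h>

#define NPTRS 1024

static void *
reclaim_only(void *arg) {
	void **ptrs = (void **)arg;

	/* The only allocator calls on this thread are deallocations. */
	for (size_t i = 0; i < NPTRS; i++) {
		free(ptrs[i]);
	}
	return NULL;
}

int
main(void) {
	static void *ptrs[NPTRS];
	for (size_t i = 0; i < NPTRS; i++) {
		ptrs[i] = malloc(8);
	}

	pthread_t thd;
	pthread_create(&thd, NULL, reclaim_only, (void *)ptrs);
	pthread_join(thd, NULL);
	return 0;
}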
-rw-r--r--  include/jemalloc/internal/tsd.h   4
-rw-r--r--  src/tsd.c                        18
-rw-r--r--  test/unit/tsd.c                  56
3 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 66d68822..c6bf28fc 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -59,6 +59,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
#define TSD_DATA_SLOW \
O(tcache_enabled, bool, bool) \
O(reentrancy_level, int8_t, int8_t) \
+ O(min_init_state_nfetched, uint8_t, uint8_t) \
O(thread_allocated_last_event, uint64_t, uint64_t) \
O(thread_allocated_next_event, uint64_t, uint64_t) \
O(thread_deallocated_last_event, uint64_t, uint64_t) \
@@ -91,6 +92,7 @@ typedef ql_elm(tsd_t) tsd_link_t;
#define TSD_DATA_SLOW_INITIALIZER \
/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
/* reentrancy_level */ 0, \
+ /* min_init_state_nfetched */ 0, \
/* thread_allocated_last_event */ 0, \
/* thread_allocated_next_event */ 0, \
/* thread_deallocated_last_event */ 0, \
@@ -177,6 +179,8 @@ void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow();
+#define TSD_MIN_INIT_STATE_MAX_FETCHED (128)
+
enum {
/* Common case --> jnz. */
tsd_state_nominal = 0,
diff --git a/src/tsd.c b/src/tsd.c
index e8e4f3a3..cef7ba58 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -300,9 +300,25 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
tsd_state_set(tsd, tsd_state_minimal_initialized);
tsd_set(tsd);
tsd_data_init_nocleanup(tsd);
+ *tsd_min_init_state_nfetchedp_get(tsd) = 1;
}
} else if (tsd_state_get(tsd) == tsd_state_minimal_initialized) {
- if (!minimal) {
+ /*
+ * If a thread only ever deallocates (e.g. dedicated reclamation
+ * threads), we want to help it to eventually escape the slow
+ * path (caused by the minimal initialized state). The nfetched
+ * counter tracks the number of times the tsd has been accessed
+ * under the min init state, and triggers the switch to nominal
+ * once the max allowed count is reached.
+ *
+ * This means at most 128 deallocations stay on the slow path.
+ *
+ * Also see comments in free_default().
+ */
+ uint8_t *nfetched = tsd_min_init_state_nfetchedp_get(tsd);
+ assert(*nfetched >= 1);
+ (*nfetched)++;
+ if (!minimal || *nfetched == TSD_MIN_INIT_STATE_MAX_FETCHED) {
/* Switch to fully initialized. */
tsd_state_set(tsd, tsd_state_nominal);
assert(*tsd_reentrancy_levelp_get(tsd) >= 1);
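As a standalone illustration of the counter-driven promotion above (a simplified model, not jemalloc's actual tsd code; model_fetch and the state names are invented for this sketch):

/*
 * Simplified model: a thread-local state starts as MINIMAL on the first
 * dealloc-only (minimal) fetch and flips to NOMINAL once it has been
 * fetched MAX_FETCHED times without a full (non-minimal) fetch.  The
 * real logic lives in tsd_fetch_slow() above.
 */
#include <stdbool.h>
#include <stdint.h>

#define MAX_FETCHED 128

enum model_state { UNINITIALIZED, MINIMAL, NOMINAL };

struct model_tsd {
	enum model_state state;
	uint8_t nfetched;
};

static void
model_fetch(struct model_tsd *tsd, bool minimal) {
	if (tsd->state == UNINITIALIZED) {
		if (minimal) {
			tsd->state = MINIMAL;
			tsd->nfetched = 1;
		} else {
			tsd->state = NOMINAL;
		}
	} else if (tsd->state == MINIMAL) {
		tsd->nfetched++;
		if (!minimal || tsd->nfetched == MAX_FETCHED) {
			/* Switch to fully initialized. */
			tsd->state = NOMINAL;
		}
	}
}

int
main(void) {
	struct model_tsd tsd = { UNINITIALIZED, 0 };

	/* A dealloc-only thread: every fetch is minimal. */
	for (int i = 0; i < 200; i++) {
		model_fetch(&tsd, true);
	}
	/* After MAX_FETCHED minimal fetches the state is NOMINAL. */
	return tsd.state == NOMINAL ? 0 : 1;
}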
diff --git a/test/unit/tsd.c b/test/unit/tsd.c
index 205d8708..bb5cd9f6 100644
--- a/test/unit/tsd.c
+++ b/test/unit/tsd.c
@@ -136,6 +136,61 @@ TEST_BEGIN(test_tsd_reincarnation) {
}
TEST_END
+static void *
+thd_start_dalloc_only(void *arg) {
+ void **ptrs = (void **)arg;
+
+ tsd_t *tsd = tsd_fetch_min();
+ if (tsd_state_get(tsd) != tsd_state_minimal_initialized) {
+ /* Allocation happened implicitly. */
+ expect_u_eq(tsd_state_get(tsd), tsd_state_nominal,
+ "TSD state should be nominal");
+ return NULL;
+ }
+
+ void *ptr;
+ for (size_t i = 0; (ptr = ptrs[i]) != NULL; i++) {
+ /* Offset by 1 because of the manual tsd_fetch_min above. */
+ if (i + 1 < TSD_MIN_INIT_STATE_MAX_FETCHED) {
+ expect_u_eq(tsd_state_get(tsd),
+ tsd_state_minimal_initialized,
+ "TSD should be minimal initialized");
+ } else {
+ /* State may be nominal or nominal_slow. */
+ expect_true(tsd_nominal(tsd), "TSD should be nominal");
+ }
+ free(ptr);
+ }
+
+ return NULL;
+}
+
+static void
+test_sub_thread_n_dalloc(size_t nptrs) {
+ void **ptrs = (void **)malloc(sizeof(void *) * (nptrs + 1));
+ for (size_t i = 0; i < nptrs; i++) {
+ ptrs[i] = malloc(8);
+ }
+ ptrs[nptrs] = NULL;
+
+ thd_t thd;
+ thd_create(&thd, thd_start_dalloc_only, (void *)ptrs);
+ thd_join(thd, NULL);
+ free(ptrs);
+}
+
+TEST_BEGIN(test_tsd_sub_thread_dalloc_only) {
+ test_sub_thread_n_dalloc(1);
+ test_sub_thread_n_dalloc(16);
+ test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 2);
+ test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED - 1);
+ test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED);
+ test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 1);
+ test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED + 2);
+ test_sub_thread_n_dalloc(TSD_MIN_INIT_STATE_MAX_FETCHED * 2);
+}
+TEST_END
+
typedef struct {
atomic_u32_t phase;
atomic_b_t error;
@@ -269,6 +324,7 @@ main(void) {
return test_no_reentrancy(
test_tsd_main_thread,
test_tsd_sub_thread,
+ test_tsd_sub_thread_dalloc_only,
test_tsd_reincarnation,
test_tsd_global_slow);
}